diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v2bf16.ll new file mode 100644 index 0000000000000..400c918bfb636 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v2bf16.ll @@ -0,0 +1,2042 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2bf16_v2bf16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v2bf16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> poison + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__0_u: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) 
%ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v2bf16__2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 
4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = 
shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> zeroinitializer + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2bf16_v2bf16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__1_1(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v2bf16__u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v2bf16__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> 
%vec0, <2 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) 
%ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v2bf16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2bf16_v2bf16__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v2bf16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v2bf16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v2bf16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__u_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__u_u: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__0_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__1_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__2_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v2bf16_v2bf16__3_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__3_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__3_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__3_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__3_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__u_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__0_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__1_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__2_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__u_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__u_1: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__0_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__1_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__2_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__u_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__0_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__1_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__2_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__u_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__0_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__1_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v2bf16__2_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v2bf16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v2bf16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v2bf16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v3bf16.ll new file mode 100644 index 0000000000000..3621f80233562 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v3bf16.ll @@ -0,0 +1,4437 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2bf16_v3bf16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v3bf16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> poison + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v3bf16__3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define 
void @v_shuffle_v2bf16_v3bf16__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__5_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; 
GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> zeroinitializer + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = 
shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__3_0(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2bf16_v3bf16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v3, v0, 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__u_1: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> 
%1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__0_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__3_2(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; 
GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v3bf16__u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v3bf16__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2bf16_v3bf16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + 
%2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__1_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__3_5(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v3bf16__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v3bf16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v3bf16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v3bf16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__u_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__0_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__1_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__2_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2bf16_v3bf16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__3_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__4_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__5_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__5_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__5_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__5_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, 
<4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__5_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__5_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__5_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__u_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__0_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__1_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__2_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__3_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__4_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x 
bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__u_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__0_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__1_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__2_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__3_1() { +; 
GFX900-LABEL: s_shuffle_v2bf16_v3bf16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__4_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__u_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__0_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> 
poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__1_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__2_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__3_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__3_2: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__4_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__u_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__0_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__0_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__1_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__2_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__3_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__3_3: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__4_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; 
def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__u_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__0_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__1_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__2_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__3_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__4_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__u_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", 
"=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__0_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 
x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__1_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + 
call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__2_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__3_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__3_5: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v3bf16__4_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v3bf16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v3bf16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v3bf16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v4bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v4bf16.ll new file mode 100644 index 0000000000000..c8715472ab19e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v4bf16.ll @@ -0,0 +1,7377 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2bf16_v4bf16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v4bf16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> poison + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> 
%vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__4_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v4bf16__4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_u: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 
v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2bf16_v4bf16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_2: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def 
$0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> zeroinitializer + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x 
i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2bf16_v4bf16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, 
v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2bf16_v4bf16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, 
v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 
v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__u_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v4bf16__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 0, i32 4> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 4> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_4:
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 4> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +;
GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 4> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v4bf16__4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 4> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +;
GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 4> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT:
s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 4> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 poison, i32 5> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__0_5(ptr addrspace(1) inreg %ptr) { +;
GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 0, i32 5> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_5: +; GFX900: ;
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 1, i32 5> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 2, i32 5> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +;
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 3, i32 5> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +;
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 4, i32 5> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT:
v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 5> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def
v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 5> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x
bfloat> %vec1, <2 x i32> <i32 poison, i32 6> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 0, i32 6> + store <2 x bfloat>
%shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 1, i32 6> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL:
v_shuffle_v2bf16_v4bf16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 2, i32 6> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_6: +; GFX900: ;
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 3, i32 6> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +;
GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 4, i32 6> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0)
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_6: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> 
%vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + 
+define void @v_shuffle_v2bf16_v4bf16__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__3_7(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2bf16_v4bf16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: 
global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v4bf16__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__u_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__0_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__1_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__2_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__3_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2bf16_v4bf16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__4_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__5_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_u: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__6_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__7_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret 
void +} + +define void @s_shuffle_v2bf16_v4bf16__7_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__7_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__7_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__7_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__7_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__7_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__7_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x 
bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__7_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__u_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__0_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__1_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__2_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__3_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__4_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__5_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__6_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__u_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__0_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__1_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__2_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__3_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2bf16_v4bf16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__4_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__5_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_1: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__6_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__u_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__0_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__1_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__2_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__3_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__4_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define 
void @s_shuffle_v2bf16_v4bf16__5_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__6_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__u_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__0_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> 
asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__1_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__2_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__3_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__4_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__5_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__6_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__u_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__0_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__1_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__2_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__3_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__4_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_4: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__5_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; 
def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__6_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__u_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2bf16_v4bf16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__0_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_5: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__1_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__2_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 
s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__3_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__4_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__5_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__6_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__u_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__0_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__1_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__2_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__3_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__4_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__5_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__6_6() 
{ +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__u_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__0_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__1_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__2_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + 
%shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__3_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; 
use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__4_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__5_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v4bf16__6_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_7: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v8bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v8bf16.ll new file mode 100644 index 0000000000000..c7ccdbb2da463 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v8bf16.ll @@ -0,0 +1,27671 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2bf16_v8bf16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v8bf16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> poison + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: 
global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x 
bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, 
v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_u: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_u(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v8bf16__8_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2bf16_v8bf16__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_u(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2bf16_v8bf16__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_alignbit_b32 v0, v0, v5, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v5, s2 +; 
GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v5, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v5, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v5, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 2> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v5, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = 
call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 3> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v6, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v7, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v7, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 4> + store <2 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 5> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} 
+ +define void @v_shuffle_v2bf16_v8bf16__15_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v7, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v7, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v7, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 6> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_7: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 7> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 8> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 9> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 10> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: 
global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 11> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 12> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_13(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 13> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, 
v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 14> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__15_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__15_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 15> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 0> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> 
zeroinitializer + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 0> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 
v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 0> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 0> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 
+; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 0> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 0> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_0(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 
x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v5, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + 
store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, 
s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_dword v5, v0, 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v4, v0 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v4, v0 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v4, v0 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v5, v0 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v0 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, 
align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 
+; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define 
void @v_shuffle_v2bf16_v8bf16__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: 
global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v4, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v4, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v4, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v5, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2bf16_v8bf16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2bf16_v8bf16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2bf16_v8bf16__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x 
bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x 
bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret 
void +} + +define void @v_shuffle_v2bf16_v8bf16__11_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2bf16_v8bf16__12_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v4, v1 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v4, v1 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v4, v1 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_3(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2bf16_v8bf16__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v5, v1 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v5, v1 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v1 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_4: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm 
"; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void 
+} + +define void @v_shuffle_v2bf16_v8bf16__8_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: 
global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v4, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v4, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v4, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v5, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v4, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_4: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v5, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v6, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v6, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v6, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_perm_b32 v0, v2, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2bf16_v8bf16__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 
v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2bf16_v8bf16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2bf16_v8bf16__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v4, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v4, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v4, v2 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v5, v2 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v4, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v5, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v5, v2 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v6, v2 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v6, v2 +; 
GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v5, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v6, s2 +; GFX940-NEXT: global_store_dword v8, v0, 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v6, v2 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v7, v2 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v7, v2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_6: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_6: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = 
shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2bf16_v8bf16__9_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v4, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v4, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v4, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v5, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v5, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v5, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v5, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v6, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v6, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v6, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v6, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX900-NEXT: global_store_dword v8, v0, 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + 
store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_7: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2bf16_v8bf16__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v4, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_perm_b32 v0, v3, v4, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v4, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v5, v3 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v5, v3 +; 
GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v3 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v5, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v6, v3 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v6, v3 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v6, v3 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v6, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v7, v3 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v7, v3 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_7: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v7, v3 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v8bf16__u_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> 
poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2bf16_v8bf16__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_8(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2bf16_v8bf16__8_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = 
shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_8: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 9> ; lanes: %vec0[0], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 9> ; lanes: %vec0[1], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, 
v1, v2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 9> ; lanes: %vec0[2], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v6, 
v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 9> ; lanes: %vec0[3], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v4 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 9> ; lanes: %vec0[4], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 9> ; lanes: %vec0[5], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v4 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call 
<8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 9> ; lanes: %vec0[6], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = 
shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 9> ; lanes: %vec0[7], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 9> ; lanes: %vec1[0], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: 
global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 9> ; lanes: %vec1[1], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 9> ; lanes: %vec1[2], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_9: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 9> ; lanes: %vec1[3], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 9> ; lanes: %vec1[4], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 9> ; lanes: %vec1[5], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void 
+} + +define void @v_shuffle_v2bf16_v8bf16__14_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 9> ; lanes: %vec1[6], %vec1[1] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 10> ; lanes: poison, %vec1[2] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 10> ; lanes: %vec0[0], %vec1[2] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 10> ; lanes: %vec0[1], %vec1[2] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 10> ; lanes: %vec0[2], %vec1[2] + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: 
global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v4, v2, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v5, v2, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_10: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v2, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v5, v3, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v5, v3, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v3, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; 
def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2bf16_v8bf16__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: 
global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> 
%shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: 
global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_11: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v5 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v5 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v5 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v5 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v5 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, 
s4, v0, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2bf16_v8bf16__11_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 
x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x 
i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v4, v1, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v4, v1, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v4, v1, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_12(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2bf16_v8bf16__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v5, v2, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v6, v2, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v6, v2, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v6, v3, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v6, v3, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v6, v3, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: 
global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x 
bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_12: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v4 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, 
s2, v1, v4 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v1, s2 +; GFX940-NEXT: 
global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v5 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v6 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v6 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v6 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v6 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v6 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = 
call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_13: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 
0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> 
%shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_14: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v5, v1, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_alignbit_b32 v0, v5, v1, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v1, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v2, s4 +; GFX90A-NEXT: 
global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v7, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v6, v2, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v7, v2, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v7, v2, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_14: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v7, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v7, v3, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v7, v3, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v7, v3, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_14: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; 
GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x 
bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__u_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__0_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_bfi_b32 v0, s4, v0, v4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v5 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v5 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__1_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v0, s4 +; 
GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__2_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v5 +; GFX900-NEXT: global_store_dword v6, v0, 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v5 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v5 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__3_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__4_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v6 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v7 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v7 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__5_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_15: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v7, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__6_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v7 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v7 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v7 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__7_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v7, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__8_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__9_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__9_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__10_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__10_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__11_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__11_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__12_15(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__13_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__13_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2bf16_v8bf16__14_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2bf16_v8bf16__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=v"() + %vec1 = call <8 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2bf16_v8bf16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2bf16_v8bf16__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_u() { +; GFX900-LABEL: 
s_shuffle_v2bf16_v8bf16__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_u() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; 
GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_5() { +; 
GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_12() 
{ +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__15_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__15_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> 
asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", 
"=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() 
+ %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 
s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_0() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, 
s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 
x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_1: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_1() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, 
s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2bf16_v8bf16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v2bf16_v8bf16__3_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2bf16_v8bf16__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> 
%shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_2() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; 
GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2bf16_v8bf16__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_3() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_4() { +; GFX900-LABEL: 
s_shuffle_v2bf16_v8bf16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 
16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf 
= shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, 
s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + 
%shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_4() { +; 
GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 
16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_4() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_4: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; 
GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, 
s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_5() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2bf16_v8bf16__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + 
call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x 
bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_6: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x 
bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_6() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s7 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2bf16_v8bf16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x 
bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_7: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, 
s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_7: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_7() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_8: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 
s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_8() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 
x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2bf16_v8bf16__10_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_9() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2bf16_v8bf16__14_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_10() { +; 
GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, 
s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, 
s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_10() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + 
+define void @s_shuffle_v2bf16_v8bf16__0_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_11: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2bf16_v8bf16__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v2bf16_v8bf16__10_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, 
s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, 
s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_11() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_12: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x 
bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_12() { +; GFX900-LABEL: 
s_shuffle_v2bf16_v8bf16__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, 
s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 
x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_12() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_13: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> 
asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x 
bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_13() { +; GFX900-LABEL: 
s_shuffle_v2bf16_v8bf16__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 
s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_13() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x 
bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2bf16_v8bf16__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", 
"=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v2bf16_v8bf16__6_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; 
GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_14() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__u_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__0_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__1_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__2_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__3_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__4_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__5_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; 
def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__6_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> 
%shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__7_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__8_15() { +; GFX900-LABEL: 
s_shuffle_v2bf16_v8bf16__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__9_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2bf16_v8bf16__9_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__9_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__10_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__10_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__11_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__11_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__12_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__13_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_15: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__13_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v2bf16_v8bf16__14_15() { +; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_15: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2bf16_v8bf16__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x bfloat> asm "; def $0", "=s"() + %vec1 = call <8 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x bfloat> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v2f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v2f16.ll new file mode 100644 index 0000000000000..a12b151ab9c73 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v2f16.ll @@ -0,0 +1,2042 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2f16_v2f16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v2f16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, 
<2 x half> poison, <2 x i32> poison + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__1_u: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v2f16__2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f16_v2f16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", 
"=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__u_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> zeroinitializer + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2f16_v2f16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; 
GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v2f16__u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x 
half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__1_2: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v2f16__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v2f16__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v2f16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v2f16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v2f16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + 
+define void @s_shuffle_v2f16_v2f16__u_u() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__0_u() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__1_u() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__2_u() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__3_u() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__3_0() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__3_1() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 
16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__3_2() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__3_3() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__u_0() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__0_0() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__1_0() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__2_0() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__u_1() { +; GFX900-LABEL: 
s_shuffle_v2f16_v2f16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__0_1() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__1_1() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__2_1() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__u_2() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use 
$0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__0_2() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__1_2() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__2_2() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__u_3() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__u_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__0_3() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__1_3() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; 
GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v2f16__2_3() { +; GFX900-LABEL: s_shuffle_v2f16_v2f16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v2f16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v2f16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v3f16.ll new file mode 100644 index 0000000000000..c31b33c52e61f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v3f16.ll @@ -0,0 +1,4437 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2f16_v3f16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v3f16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> poison + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v3f16__3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x 
half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__5_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, 
v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; 
GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: 
v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> 
asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> zeroinitializer + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2f16_v3f16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__1_1(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2f16_v3f16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; 
GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, 
v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) 
%ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; 
GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__4_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v3f16__u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v3f16__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__u_4(ptr addrspace(1) inreg %ptr) { 
+; GFX900-LABEL: v_shuffle_v2f16_v3f16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, 
v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 
0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v3f16__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v3f16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v3f16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v3f16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", 
"=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @s_shuffle_v2f16_v3f16__u_u() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__0_u() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__1_u() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> 
%shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__2_u() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__3_u() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__4_u() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__5_u() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__5_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__5_0() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__5_1() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__5_2() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__5_2: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__5_3() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def 
$0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__5_4() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__5_5() { +; 
GFX900-LABEL: s_shuffle_v2f16_v3f16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__u_0() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__0_0() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__1_0() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__2_0() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__3_0() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__4_0() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__u_1() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__0_1() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__1_1() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__2_1() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> 
asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__3_1() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__4_1() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, 
s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__u_2() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f16_v3f16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__0_2() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, 
s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__1_2() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__2_2() { +; GFX900-LABEL: 
s_shuffle_v2f16_v3f16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__3_2() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 
16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__4_2() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__u_3() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__0_3() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__1_3() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 
s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__2_3() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__3_3() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2f16_v3f16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__4_3() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__u_4() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__0_4() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__1_4() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__2_4() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__3_4() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2f16_v3f16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__4_4() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__u_5() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = 
shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__0_5() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define 
void @s_shuffle_v2f16_v3f16__1_5() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__2_5() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__2_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__3_5() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v3f16__4_5() { +; GFX900-LABEL: s_shuffle_v2f16_v3f16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v3f16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v3f16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v4f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v4f16.ll new file mode 100644 index 0000000000000..b59da88773c6c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v4f16.ll @@ -0,0 +1,7377 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2f16_v4f16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v4f16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> 
poison, <2 x i32> poison + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__1_u: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__4_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v4f16__4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: 
v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: 
global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2f16_v4f16__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: 
global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; 
GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> zeroinitializer + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__1_0(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f16_v4f16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f16_v4f16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x 
half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__6_0(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2f16_v4f16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__u_1: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f16_v4f16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define 
void @v_shuffle_v2f16_v4f16__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: 
global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 
0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__u_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v4f16__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2f16_v4f16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v4f16__4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: 
global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__0_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2f16_v4f16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: 
global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2f16_v4f16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f16_v4f16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: 
global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 
v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v4f16__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v4f16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v4f16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v4f16__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @s_shuffle_v2f16_v4f16__u_u() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__0_u() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__0_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__1_u() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__2_u() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__3_u() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__4_u() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__5_u() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 
s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__6_u() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 
x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__7_u() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__7_0() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__7_0: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__7_1() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__7_2() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__7_3() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__7_4() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", 
"=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__7_5() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__7_6() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__7_7() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__u_0() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__0_0() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__1_0() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__2_0() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__3_0() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__4_0() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__5_0() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__6_0() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__u_1() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__0_1() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__1_1() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2f16_v4f16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__2_1() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect 
"; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__3_1() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__4_1() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__5_1() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2f16_v4f16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__6_1() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__u_2() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__0_2() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__0_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__1_2() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__2_2() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__3_2() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__4_2() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__4_2: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__5_2() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__6_2() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__u_3() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__0_3() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f16_v4f16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__1_3() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__1_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__2_3() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__3_3() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__4_3() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2f16_v4f16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__5_3() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 
s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__6_3() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call 
<4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__u_4() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__0_4() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__1_4() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__2_4() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__3_4() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__4_4() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__5_4() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__6_4() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() 
+ %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__u_5() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__0_5() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__1_5() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__2_5() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__3_5() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__4_5() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x 
half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__5_5() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__6_5() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__u_6() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__0_6() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; 
def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__1_6() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__2_6() { +; GFX900-LABEL: 
s_shuffle_v2f16_v4f16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__3_6() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, 
s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__4_6() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__5_6() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__6_6() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__u_7() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__0_7() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__1_7() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__2_7() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__3_7() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__4_7() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__5_7() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v4f16__6_7() { +; GFX900-LABEL: s_shuffle_v2f16_v4f16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v4f16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v4f16__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v8f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v8f16.ll new file mode 100644 index 0000000000000..bc4f74e9d711a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f16.v8f16.ll @@ -0,0 +1,27671 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2f16_v8f16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v8f16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> poison + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2f16_v8f16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} 
+ +define void @v_shuffle_v2f16_v8f16__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v8f16__8_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_u(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2f16_v8f16__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2f16_v8f16__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v5, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v5, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v5, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v5, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + 
store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v5, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + 
ret void +} + +define void @v_shuffle_v2f16_v8f16__15_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v6, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v7, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v7, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v7, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v7, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v7, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; 
GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, 
v1, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> 
%shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 
v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__15_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__15_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__15_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__15_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) 
%ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> zeroinitializer + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: 
global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_0(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2f16_v8f16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_0: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store 
<2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2f16_v8f16__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v5, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 
v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", 
"=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f16_v8f16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f16_v8f16__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v4, v0 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v4, v0 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v4, v0 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v5, v0 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v0 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2f16_v8f16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v4, 
v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void 
+} + +define void @v_shuffle_v2f16_v8f16__9_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v4, 16 
+; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v4, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v4, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v5, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: 
global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_3(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2f16_v8f16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = 
shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 
+; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v4, v1 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v4, v1 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v4, v1 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v5, v1 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v5, v1 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v1 +; 
GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_4: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f16_v8f16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f16_v8f16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf 
= shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2f16_v8f16__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v4, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v4, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v4, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v5, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v4, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v5, s4 +; 
GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v6, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v6, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v6, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f16_v8f16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + 
store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v4, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v4, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v4, v2 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v5, v2 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v4, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v5, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v5, v2 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v6, v2 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v6, v2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v5, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v6, s2 +; 
GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v6, v2 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v7, v2 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v7, v2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 
x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX900-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; 
GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v4, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v4, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v4, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v5, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v5, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v5, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v5, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v6, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v6, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f16_v8f16__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v6, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v6, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f16_v8f16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, 
s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v4, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v4, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v4, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v5, v3 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v5, v3 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v3 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_perm_b32 v0, v3, v5, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v6, v3 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v6, v3 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v6, v3 +; GFX940-NEXT: 
global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v6, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v7, v3 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v7, v3 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v7, v3 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v8f16__u_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret 
void +} + +define void @v_shuffle_v2f16_v8f16__1_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f16_v8f16__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f16_v8f16__8_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x 
half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_8: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f16_v8f16__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: 
v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, 
v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v4 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_9: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v4 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_9(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; 
GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def 
$0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x 
half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_10(ptr addrspace(1) inreg %ptr) 
{ +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_10: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v4, v2, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v5, v2, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v2, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX900-NEXT: 
global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v5, v3, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v5, v3, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v3, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def 
$0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_10: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f16_v8f16__14_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, 
<8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) 
%ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2f16_v8f16__2_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_11(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2f16_v8f16__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_11: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v5 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v5 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v5 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v5 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v5 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, 
s4, v0, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> 
%shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_12: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; 
GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 
v0, v4, v0, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v4, v1, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v4, v1, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v4, v1, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm 
"; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> 
%vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v5, v2, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v6, v2, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v6, v2, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_12(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_12: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v6, v3, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v6, v3, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v6, v3, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_12: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + 
%vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_12: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f16_v8f16__14_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_13: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v4 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v4 +; 
GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v5 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v6 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v6 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = 
call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v6 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v6 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v6 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x 
half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> 
%shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", 
"=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f16_v8f16__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f16_v8f16__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v5, v1, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v5, v1, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v1, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v7, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v6, v2, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v7, v2, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_alignbit_b32 v0, v7, v2, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v7, v3, s2 +; GFX940-NEXT: 
global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v7, v3, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v7, v3, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v7, v3, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + 
%vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_14: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_14: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2f16_v8f16__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2f16_v8f16__14_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__u_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__0_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, 
s4, v0, v5 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v5 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__1_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__2_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v5 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v5 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v5 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__3_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__3_15: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__4_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v6 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v7 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v7 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__5_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v7, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__6_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v7 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v7 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 
0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v7 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__7_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v7, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__8_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; 
def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__9_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__9_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__9_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__9_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__10_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__10_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__10_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__10_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__11_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__11_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__11_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__11_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__12_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, 
s4, v2, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__13_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__13_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__13_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__13_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2f16_v8f16__14_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f16_v8f16__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f16_v8f16__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f16_v8f16__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=v"() + %vec1 = call <8 x half> asm "; def $0", "=v"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + store <2 x half> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_u() { +; GFX900-LABEL: 
s_shuffle_v2f16_v8f16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + 
%shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_u: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_u: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f16_v8f16__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_u() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 
s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: 
s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2f16_v8f16__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__15_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__15_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__15_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, 
s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__15_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", 
"=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f16_v8f16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + 
+define void @s_shuffle_v2f16_v8f16__7_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm 
"; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + 
+define void @s_shuffle_v2f16_v8f16__12_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_0() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_0: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> 
poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 
s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_1() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2f16_v8f16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_2() { +; 
GFX900-LABEL: s_shuffle_v2f16_v8f16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", 
"=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s5 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_2() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2f16_v8f16__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_3() { +; 
GFX900-LABEL: s_shuffle_v2f16_v8f16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 
s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2f16_v8f16__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 
s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_3() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void 
+} + +define void @s_shuffle_v2f16_v8f16__9_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f16_v8f16__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_4() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2f16_v8f16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, 
<2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm 
"; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + 
+define void @s_shuffle_v2f16_v8f16__13_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_5() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + 
+define void @s_shuffle_v2f16_v8f16__5_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_6() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_7() { +; GFX900-LABEL: 
s_shuffle_v2f16_v8f16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: 
s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x 
half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_7() { 
+; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; 
GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_7() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void 
@s_shuffle_v2f16_v8f16__6_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2f16_v8f16__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f16_v8f16__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void 
@s_shuffle_v2f16_v8f16__12_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_8() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_9: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, 
s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_9: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + 
%vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_9: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def 
$0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: 
s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_9() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_10: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = 
call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void 
@s_shuffle_v2f16_v8f16__8_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, 
s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_10() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call 
<8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, 
s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 
16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> 
asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 
x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, 
s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2f16_v8f16__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x 
half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, 
s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_11() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call 
<8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, 
s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() 
+ %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_12() { +; GFX900-LABEL: 
s_shuffle_v2f16_v8f16__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_12() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f16_v8f16__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2f16_v8f16__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; 
GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; 
def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x 
half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, 
s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, 
s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 
s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_13() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> <i32 14, i32 13> ; selects %vec1[6], %vec1[5] + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0",
"=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> <i32 poison, i32 14> ; selects poison, %vec1[6] + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> <i32 0, i32 14> ; selects %vec0[0], %vec1[6] + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +;
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s11 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2f16_v8f16__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> <i32 6, i32 14> ; selects %vec0[6], %vec1[6] + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> <i32 7, i32 14> ; selects %vec0[7], %vec1[6] + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_14() { +; GFX900-LABEL:
s_shuffle_v2f16_v8f16__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> <i32 10, i32 14> ; selects %vec1[2], %vec1[6] + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> <i32 11, i32 14> ; selects %vec1[3], %vec1[6] + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +;
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_14() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__14_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> <i32 14, i32 14> ; selects %vec1[6], %vec1[6] + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__u_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> <i32 poison, i32 15> ; selects poison, %vec1[7] + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__0_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL:
s_shuffle_v2f16_v8f16__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__1_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__2_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__3_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2f16_v8f16__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__4_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__5_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; 
GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__6_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; 
def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__7_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x 
half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__8_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__9_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__9_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__9_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__9_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__10_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__10_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__10_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, 
s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__10_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__11_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__11_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__11_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__11_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, 
s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__12_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__13_15() { +; 
GFX900-LABEL: s_shuffle_v2f16_v8f16__13_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f16_v8f16__13_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__13_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} + +define void @s_shuffle_v2f16_v8f16__14_15() { +; GFX900-LABEL: s_shuffle_v2f16_v8f16__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f16_v8f16__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f16_v8f16__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x half> asm "; def $0", "=s"() + %vec1 = call <8 x half> asm "; def $0", "=s"() + %shuf = shufflevector <8 x half> %vec0, <8 x half> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x half> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll new file mode 100644 index 0000000000000..9ee4ddcea7e4b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll @@ -0,0 +1,1875 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2f32_v2f32__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v2f32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() 
+ %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> poison + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__2_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v2f32__2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f32_v2f32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void 
+} + +define void @v_shuffle_v2f32_v2f32__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2f32_v2f32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> zeroinitializer + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f32_v2f32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def 
$0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2f32_v2f32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__u_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v2f32__u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 
v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v2f32__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v2f32__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v2f32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v2f32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v2f32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x 
float> %vec0, <2 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v2f32_v2f32__u_u() { +; GFX9-LABEL: s_shuffle_v2f32_v2f32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__0_u() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__1_u() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__1_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__2_u() { +; GFX9-LABEL: s_shuffle_v2f32_v2f32__2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__3_u() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__3_0() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__3_1() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__3_2() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v2f32__3_3() { +; GFX9-LABEL: s_shuffle_v2f32_v2f32__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__u_0() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v2f32__0_0() { +; GFX9-LABEL: s_shuffle_v2f32_v2f32__0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__1_0() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__2_0() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__u_1() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__0_1() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__1_1() { +; GFX9-LABEL: 
s_shuffle_v2f32_v2f32__1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__2_1() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__u_2() { +; GFX9-LABEL: s_shuffle_v2f32_v2f32__u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; 
GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__0_2() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__1_2() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f32_v2f32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__2_2() { +; GFX9-LABEL: s_shuffle_v2f32_v2f32__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__u_3() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__0_3() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v2f32__1_3() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret 
void +} + +define void @s_shuffle_v2f32_v2f32__2_3() { +; GFX900-LABEL: s_shuffle_v2f32_v2f32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v2f32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v2f32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll new file mode 100644 index 0000000000000..1dd2e3ee3f3f4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll @@ -0,0 +1,4236 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2f32_v3f32__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v3f32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> poison + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v2f32_v3f32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__3_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v3f32__3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2f32_v3f32__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f32_v3f32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__5_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__5_5: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2f32_v3f32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> zeroinitializer + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> 
%vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__1_1: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, 
v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x 
float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2f32_v3f32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__u_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v3f32__u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} 
+ +define void @v_shuffle_v2f32_v3f32__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 
+; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v3f32__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__0_4: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, 
v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__0_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 
+; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__3_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v3f32__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v3f32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v3f32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v3f32__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v2f32_v3f32__u_u() { +; GFX9-LABEL: s_shuffle_v2f32_v3f32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__0_u() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__1_u() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__2_u() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__3_u() { +; GFX9-LABEL: s_shuffle_v2f32_v3f32__3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__4_u() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__5_u() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__5_0() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> 
+ call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__5_1() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__5_2() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__5_3() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f32_v3f32__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__5_4() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__5_5() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__u_0() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__0_0() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__1_0() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__2_0() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__3_0() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__4_0() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__u_1() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__0_1() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__1_1() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__2_1() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__2_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__3_1() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__4_1() { +; GFX900-LABEL: 
s_shuffle_v2f32_v3f32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__u_2() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__0_2() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__1_2() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v3f32__2_2() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__3_2() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__4_2() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__u_3() { +; GFX9-LABEL: s_shuffle_v2f32_v3f32__u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__0_3() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__1_3() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__2_3() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__3_3() { +; GFX9-LABEL: s_shuffle_v2f32_v3f32__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__4_3() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__u_4() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__0_4() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__1_4() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + 
ret void +} + +define void @s_shuffle_v2f32_v3f32__2_4() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__3_4() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__4_4() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__u_5() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, 
<3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__0_5() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__1_5() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__1_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__2_5() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, 
s6 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__3_5() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v3f32__4_5() { +; GFX900-LABEL: s_shuffle_v2f32_v3f32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v3f32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v3f32__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: 
s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll new file mode 100644 index 0000000000000..3d862c3fe3a29 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll @@ -0,0 +1,6929 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2f32_v4f32__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v4f32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> poison + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, 
v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2f32_v4f32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__4_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v4f32__4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 
v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> 
%vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx2 v7, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2f32_v4f32__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f32_v4f32__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f32_v4f32__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> zeroinitializer + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 
v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__4_0(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__6_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2f32_v4f32__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f32_v4f32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2f32_v4f32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__0_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> 
poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v7, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v7, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: 
global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def 
$0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; 
GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__6_3: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__u_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v4f32__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f32_v4f32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v4f32__4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2f32_v4f32__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__0_6: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; 
GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2f32_v4f32__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__6_6(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2f32_v4f32__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__u_7: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2f32_v4f32__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v4f32__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v4f32__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v4f32__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v4f32__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v2f32_v4f32__u_u() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__0_u() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__1_u() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + 
ret void +} + +define void @s_shuffle_v2f32_v4f32__2_u() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__3_u() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__4_u() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__5_u() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__6_u() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__7_u() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__7_0() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__7_1() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2f32_v4f32__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__7_2() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__7_3() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__7_4() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__7_5() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__7_6() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2f32_v4f32__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__7_7() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__u_0() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__0_0() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> 
poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__1_0() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__2_0() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + 
%shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__3_0() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__4_0() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__5_0() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__5_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__6_0() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__u_1() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__0_1() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__1_1() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__1_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__2_1() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__3_1() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2f32_v4f32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__4_1() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v4f32__5_1() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__6_1() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__u_2() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__0_2() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__1_2() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__2_2() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; 
GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__3_2() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__4_2() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__5_2() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__6_2() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__u_3() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__0_3() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; 
GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__1_3() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__1_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__2_3() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__3_3() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__4_3() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__5_3() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__6_3() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call 
<4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__u_4() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__0_4() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__1_4() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__2_4() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__3_4() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + 
+define void @s_shuffle_v2f32_v4f32__4_4() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__5_4() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v4f32__6_4() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__6_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__u_5() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__0_5() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__1_5() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__2_5() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__3_5() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__4_5() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__5_5() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__6_5() { +; GFX9-LABEL: 
s_shuffle_v2f32_v4f32__6_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__u_6() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v4f32__0_6() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__1_6() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__2_6() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: s_shuffle_v2f32_v4f32__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__3_6() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, 
s7 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__4_6() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; 
GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__5_6() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__6_6() { +; GFX9-LABEL: 
s_shuffle_v2f32_v4f32__6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__u_7() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__0_7() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__0_7: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__1_7() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__2_7() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__3_7() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__4_7() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__5_7() { +; GFX9-LABEL: s_shuffle_v2f32_v4f32__5_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v4f32__6_7() { +; GFX900-LABEL: s_shuffle_v2f32_v4f32__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v4f32__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v4f32__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <2 x 
i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll new file mode 100644 index 0000000000000..d285d6ce92256 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll @@ -0,0 +1,25924 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2f32_v8f32__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v8f32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> poison + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_u(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2f32_v8f32__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v8f32__8_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm 
"; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v8 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v9 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v9 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v10 +; GFX900-NEXT: global_store_dwordx2 v11, v[1:2], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v11 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f32_v8f32__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v11 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v12 +; GFX900-NEXT: global_store_dwordx2 v13, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v13 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v13 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v14 +; GFX900-NEXT: global_store_dwordx2 v15, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v15 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: 
global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v15 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f32_v8f32__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v15 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, 
v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2f32_v8f32__15_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: 
global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; 
GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 
v4, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__15_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__15_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__15_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__15_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2f32_v8f32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> zeroinitializer + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; 
GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_0: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f32_v8f32__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: 
global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2f32_v8f32__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_1: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + 
store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f32_v8f32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x 
float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 
v10, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call 
<8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 
+ ret void +} + +define void @v_shuffle_v2f32_v8f32__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f32_v8f32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: 
global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: 
global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f32_v8f32__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v2 +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2f32_v8f32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2f32_v8f32__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v9 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v9 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 
v3, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_4: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: global_store_dwordx2 v13, 
v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f32_v8f32__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v4 +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v11 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, 
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_4: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v4 +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x 
float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: 
global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", 
"=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_5: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store 
<2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v9 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v9 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v11 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v11 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v2f32_v8f32__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f32_v8f32__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> 
asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: 
global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v11, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v6 +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v6 +; GFX900-NEXT: global_store_dwordx2 v15, 
v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v11 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v13, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v6 +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[12:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v13 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[13:14], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v15, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v15, v6 +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_7(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2f32_v8f32__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_7(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 
v6, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v9 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v9 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v11, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v11 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v11 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[13:14], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v13 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v13 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v15, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v15, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v8f32__u_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x 
float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_8: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2f32_v8f32__8_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2f32_v8f32__12_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = 
call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + 
ret void +} + +define void @v_shuffle_v2f32_v8f32__1_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v8 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, 
v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_9: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> 
%vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 
v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: 
global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f32_v8f32__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v8 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v8 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; 
GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v10 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v10 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> 
asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> 
%vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 
v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx2 
v10, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = 
call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v8 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_11: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v9 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v10 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v7 +; GFX900-NEXT: 
global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2f32_v8f32__11_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v8 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v8 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v10 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v10 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; 
GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v12 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v12 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2f32_v8f32__10_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2f32_v8f32__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v8 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v9 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v10 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v11 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v12 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x 
float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; 
GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; 
def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x 
float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v10 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v10 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_14(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v12 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v12 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, 
v13 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v14 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v14 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[13:14], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f32_v8f32__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v14 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v14 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: 
v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__14_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__u_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__0_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__0_15: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v8 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__1_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__2_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v10 +; GFX900-NEXT: 
global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__3_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2f32_v8f32__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__4_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v12 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v13 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__5_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v12, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__6_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v14 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v15 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__7_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__8_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", 
"=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__9_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__9_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__9_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__9_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__10_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__10_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__10_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__10_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__11_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__11_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__11_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__11_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__12_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2f32_v8f32__13_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__13_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__13_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__13_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v2f32_v8f32__14_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2f32_v8f32__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2f32_v8f32__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2f32_v8f32__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=v"() + %vec1 = call <8 x float> asm "; def $0", "=v"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + store <2 x float> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_u() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_u() { +; 
GFX900-LABEL: s_shuffle_v2f32_v8f32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_u() { +; GFX900-LABEL: 
s_shuffle_v2f32_v8f32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + 
+define void @s_shuffle_v2f32_v8f32__6_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_u() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__8_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__10_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_u() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s15 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s19 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s19 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s15 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s15 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_11() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__15_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v8f32__15_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_13: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__15_15() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__15_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret 
void +} + +define void @s_shuffle_v2f32_v8f32__1_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_0() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_0: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_0() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__6_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_0() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_1() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_1: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_1() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__6_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x 
float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2f32_v8f32__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", 
"=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> 
%shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_1() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_2() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; 
GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: 
s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_2() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__6_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s6 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_2() { +; GFX900-LABEL: 
s_shuffle_v2f32_v8f32__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s13, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_2() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_3: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_3() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__1_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v8f32__2_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_3() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_3() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__5_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_3() { +; GFX9-LABEL: 
s_shuffle_v2f32_v8f32__6_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_3() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__7_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_3() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 0, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 1, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_4() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__2_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s12 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 2, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 3, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 5, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_4() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__6_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: 
s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 6, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 7, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 8, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; 
GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> <i32 9, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> <i32 10, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> <i32 11, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s17, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x 
i32> <i32 12, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s13 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> <i32 13, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_4() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s12 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> <i32 14, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_5: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 poison, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 
+; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 0, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 1, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_5() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__2_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 2, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 3, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_5() { +; GFX900-LABEL: 
s_shuffle_v2f32_v8f32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 4, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; 
GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_5() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__6_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 6, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 7, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + 
+define void @s_shuffle_v2f32_v8f32__10_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f32_v8f32__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s17 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s17 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s13 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_5() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__14_5: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x 
float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_6: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_6() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__2_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s14 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_6() { +; GFX9-LABEL: 
s_shuffle_v2f32_v8f32__6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v8f32__8_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s17, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s13 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_6() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 poison, i32 7> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 0, i32 7> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_7() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__1_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 1, i32 7> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_7() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__2_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 2, i32 7> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_7() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__3_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; 
GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 3, i32 7> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 4, i32 7> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_7() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__5_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 
s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 5, i32 7> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> <i32 6, i32 7> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_7() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f32_v8f32__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> <i32 9, i32 7> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> <i32 10, i32 7> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v8f32__14_7() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_8() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__u_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_8: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 
x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_8() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__8_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: 
s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_8() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__10_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_8() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v8f32__14_8() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__14_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> 
%vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f32_v8f32__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_9() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__10_9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: 
s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_9() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_9() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__14_9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> 
%vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__2_10: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s14 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s14 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s10 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; 
GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_10() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__10_10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret 
void +} + +define void @s_shuffle_v2f32_v8f32__11_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f32_v8f32__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_10() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_10() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__14_10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s6 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v8f32__2_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call 
void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_11() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__9_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_11() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__11_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def 
$0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_11() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_11() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__13_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; 
GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_11() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__14_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s12 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s12 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s8 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x 
float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s16 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s16 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s12 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s12 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__6_12: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s12 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v2f32_v8f32__9_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_12() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__10_12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s12 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector 
<8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_12() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_12() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__14_12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} 
+ +define void @s_shuffle_v2f32_v8f32__3_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s13 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s9 +; GFX900-NEXT: s_mov_b64 
s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_13() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__10_13: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call 
void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_13() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2f32_v8f32__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_13() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__14_13: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s10 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b64 
s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_14: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void 
asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s18 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s18 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s14 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s18 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_14: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 
x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_14() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__10_14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s14 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_14() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_14() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__14_14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__u_15() { +; GFX900-LABEL: 
s_shuffle_v2f32_v8f32__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__0_15() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__1_15() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__2_15() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__3_15() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void 
asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__4_15() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__5_15() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__6_15() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__7_15() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__8_15() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 
+; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__9_15() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__9_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__10_15() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__10_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__11_15() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__11_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s7 +; 
GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__12_15() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__13_15() { +; GFX9-LABEL: s_shuffle_v2f32_v8f32__13_15: +; GFX9: 
; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} + +define void @s_shuffle_v2f32_v8f32__14_15() { +; GFX900-LABEL: s_shuffle_v2f32_v8f32__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2f32_v8f32__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2f32_v8f32__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x float> asm "; def $0", "=s"() + %vec1 = call <8 x float> asm "; def $0", "=s"() + %shuf = shufflevector <8 x float> %vec0, <8 x float> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x float> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v2i16.ll new file mode 100644 index 0000000000000..67bab8ac71963 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v2i16.ll @@ -0,0 +1,2021 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i16_v2i16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v2i16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> poison + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i16_v2i16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) 
%ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v2i16__2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2i16_v2i16__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> zeroinitializer + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + 
+define void @v_shuffle_v2i16_v2i16__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i16_v2i16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v2i16__u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__2_2(ptr addrspace(1) inreg %ptr) { +; 
GFX9-LABEL: v_shuffle_v2i16_v2i16__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v2i16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v2i16__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v2i16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i16_v2i16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v2i16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @s_shuffle_v2i16_v2i16__u_u() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__0_u() { +; GFX900-LABEL: 
s_shuffle_v2i16_v2i16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 0, i32 poison> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__1_u() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__1_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 1, i32 poison> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__2_u() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 2, i32 poison> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__3_u() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 3, i32 poison> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__3_0() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__3_1() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x 
i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__3_2() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__3_3() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__u_0() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 poison, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__0_0() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__1_0() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 1, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__2_0() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__2_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 2, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__u_1() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 poison, i32 1> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__0_1() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 0, i32 1> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__1_1() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 1, i32 1> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__2_1() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 2, i32 1> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__u_2() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 poison, i32 2> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__0_2() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 0, i32 2> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__1_2() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 
s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 1, i32 2> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__2_2() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <2 x i32> <i32 2, i32 2> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret 
void +} + +define void @s_shuffle_v2i16_v2i16__u_3() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__0_3() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__1_3() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v2i16__2_3() { +; GFX900-LABEL: s_shuffle_v2i16_v2i16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v2i16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v2i16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v3i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v3i16.ll new file mode 100644 index 0000000000000..563811fa2d05a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v3i16.ll @@ -0,0 +1,4404 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i16_v3i16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v3i16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> poison + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v3i16__3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x 
i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i16_v3i16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, 
v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, 
v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, 
<3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> zeroinitializer + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2i16_v3i16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__3_1: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + 
%1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__1_2: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i16_v3i16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v3i16__u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__0_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x 
i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v3i16__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + store <2 x i16> %shuf, 
ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i16_v3i16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__4_4: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, 
v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 
v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x 
i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v3i16__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v3i16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v3i16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v3i16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@s_shuffle_v2i16_v3i16__u_u() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__0_u() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__1_u() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__2_u() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__3_u() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use 
$0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__4_u() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__5_u() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__5_0() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__5_1() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 
s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__5_2() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = 
shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__5_3() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__5_4() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__5_5() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__u_0() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 
x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__0_0() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__1_0() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, 
s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__2_0() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i16_v3i16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__3_0() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void 
+} + +define void @s_shuffle_v2i16_v3i16__4_0() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__u_1() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__u_1: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__0_1() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__1_1() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__2_1() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__2_1: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__3_1() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__4_1() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__u_2() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__0_2() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__1_2() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__2_2() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf 
= shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__3_2() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__4_2() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__u_3() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__u_3: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__0_3() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__1_3() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__2_3() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__3_3() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__4_3() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__u_4() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__0_4() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", 
"=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__1_4() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect 
"; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__2_4() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__3_4() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__4_4() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__u_5() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = 
shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__0_5() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__1_5() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__2_5() { +; GFX900-LABEL: 
s_shuffle_v2i16_v3i16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__3_5() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, 
s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v3i16__4_5() { +; GFX900-LABEL: s_shuffle_v2i16_v3i16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v3i16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v3i16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v4i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v4i16.ll new file mode 100644 index 0000000000000..b44d65119d167 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v4i16.ll @@ -0,0 +1,7263 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i16_v4i16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v4i16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 
x i16> poison, <2 x i32> poison + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__1_u: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__4_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v4i16__4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 
16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, 
v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__7_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> zeroinitializer + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__1_0: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2i16_v4i16__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__0_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> 
%vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, 
ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: 
v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: 
v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: 
global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", 
"=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__4_3: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: 
global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__u_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v4i16__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v4i16__4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 
v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: v_shuffle_v2i16_v4i16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__3_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: 
global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + 
+define void @v_shuffle_v2i16_v4i16__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__u_7(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__0_7: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 
v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v4i16__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v4i16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v4i16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v4i16__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @s_shuffle_v2i16_v4i16__u_u() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__0_u() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + 
ret void +} + +define void @s_shuffle_v2i16_v4i16__1_u() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__2_u() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__3_u() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__4_u() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__4_u: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__5_u() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 5, i32 poison> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__6_u() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 6, i32 poison> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__7_u() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__7_u: +; GFX90A: ;
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 7, i32 poison> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__7_0() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__7_0: +; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 7, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__7_1() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use
s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 7, i32 1> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__7_2() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 7, i32 2> + call void
asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__7_3() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 7, i32 3> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__7_4() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5,
s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 7, i32 4> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__7_5() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 7, i32 5> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__7_6() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf =
shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 7, i32 6> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__7_7() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 7, i32 7> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__u_0() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__u_0: +; GFX90A:
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> <i32 poison, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__0_0() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT:
s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__1_0() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> <i32 1, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__2_0() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +;
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> <i32 2, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__3_0() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> <i32 3, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__4_0() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> <i32 4, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__5_0() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0)
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 5, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__6_0() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT:
;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> <i32 6, i32 0> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__u_1() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__u_1: +; GFX940: ;
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> <i32 poison, i32 1> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__0_1() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> <i32 0, i32 1> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__1_1() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT:
s_pack_hh_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> <i32 1, i32 1> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__2_1() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +;
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> <i32 2, i32 1> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__3_1() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> <i32 3, i32 1> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__4_1() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +;
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__5_1() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2i16_v4i16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__6_1() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__u_2() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__0_2() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__1_2() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__2_2() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__3_2() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__4_2() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__5_2() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__6_2() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__u_3() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__0_3() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__1_3() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v2i16_v4i16__2_3() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__3_3() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__4_3() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__5_3() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__6_3() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i16_v4i16__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__u_4() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, 
<2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__0_4() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__1_4() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__2_4() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__3_4() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__3_4: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__4_4() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__5_4() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__6_4() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, 
s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__u_5() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__0_5() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__1_5() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__2_5() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__3_5() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__4_5() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__5_5() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__6_5() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__u_6() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__0_6() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> 
%vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__1_6() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__2_6() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__3_6() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__4_6() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__5_6() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call 
<4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__6_6() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__u_7() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__u_7: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__0_7() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__1_7() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__2_7() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__3_7() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__4_7() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__5_7() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v4i16__6_7() { +; GFX900-LABEL: s_shuffle_v2i16_v4i16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v4i16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v4i16__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v8i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v8i16.ll new file mode 100644 index 0000000000000..9d12e64088e72 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i16.v8i16.ll @@ -0,0 +1,27155 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i16_v8i16__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v8i16__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> poison + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i16_v8i16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, 
ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword 
v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v8i16__8_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i16_v8i16__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> 
%shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + 
+define void @v_shuffle_v2i16_v8i16__15_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; 
GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v5, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v5, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v5, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v5, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 2> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4,
0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v5, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 3> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v6, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +;
GFX90A-NEXT: v_alignbit_b32 v0, v2, v7, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v7, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 4> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +;
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 5> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v7, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v7, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_6: +; GFX940: ;
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v7, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 6> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +;
GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 7> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def
$0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 8> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 9> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_10: +; GFX900: ;
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 10> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_11: +;
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 11> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +;
GFX940-LABEL: v_shuffle_v2i16_v8i16__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 12> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0,
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 13> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 14> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__15_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL:
v_shuffle_v2i16_v8i16__15_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__15_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__15_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> <i32 15, i32 15> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT:
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> <i32 poison, i32 0> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> zeroinitializer + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT:
s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> <i32 1, i32 0> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> <i32 2, i32 0> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> <i32 3, i32 0> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT:
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> <i32 4, i32 0> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + 
store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 
v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 
v0, v0, v5, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword 
v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 
v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_1(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 
+; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; 
GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v4, v0 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v4, v0 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v5, v0, 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v4, v0 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v5, v0 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v0 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2i16_v8i16__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 
v0, v1, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v4, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v4, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v4, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> 
asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v5, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v5, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", 
"=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword 
v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, 
<8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", 
"=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; 
GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v4, v1 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v4, v1 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v4, v1 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v4, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v4, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v5, v1 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v5, v1 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v1 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define 
void @v_shuffle_v2i16_v8i16__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_dword v4, 
v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v4, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v4, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v4, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v5, 
s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v4, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def 
$0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v5, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v6, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + 
%shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v6, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v6, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2i16_v8i16__14_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_5(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2i16_v8i16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector 
<8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; 
GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_5(ptr addrspace(1) inreg %ptr) { 
+; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v4, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v4, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v4, v2 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v5, v2 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v4, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 
0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v5, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v5, v2 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v6, v2 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v6, v2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v5, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_perm_b32 v0, v2, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v6, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v6, v2 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v7, v2 +; GFX90A-NEXT: 
global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v7, v2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, 
align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v4, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v4, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v4, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm 
"; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v5, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", 
"=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v5, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v5, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v5, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2i16_v8i16__12_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v6, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_6(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2i16_v8i16__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v6, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v6, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v6, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v7, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v7, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: 
global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i16_v8i16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX90A-NEXT: global_store_dword 
v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, 
<8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v4, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v4, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v4, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v5, v3 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_bfi_b32 v0, s4, v5, v3 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v5, v3 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v5, s4 +; GFX90A-NEXT: global_store_dword v8, v0, 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v5, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v6, v3 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v6, v3 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v6, v3 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v6, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_7: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v6, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v7, v3 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v7, v3 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v7, v3 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v8i16__u_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_8(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i16_v8i16__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i16_v8i16__8_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_8: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i16_v8i16__14_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> 
%vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret 
void +} + +define void @v_shuffle_v2i16_v8i16__1_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_9(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_9: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v4 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 
v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v4 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i16_v8i16__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_9: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_9(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: 
global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; 
def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2i16_v8i16__4_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_10(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2i16_v8i16__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v4, v2, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v5, v2, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v2, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v5, v3, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v5, v3, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v3, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_dword 
v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 
v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2i16_v8i16__12_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 
v0, v1, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> 
asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> 
%vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, 
align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v5 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v5 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_11(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i16_v8i16__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v5 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v5 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v5 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_11: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + 
%vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v4, v1, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v4, v1, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v4, v1, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm 
"; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v5, v2, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v6, v2, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v6, v2, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr 
addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2i16_v8i16__7_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v6, v3, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v6, v3, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v6, v3, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i16_v8i16__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void 
@v_shuffle_v2i16_v8i16__13_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 
0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v4 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v4 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v4, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v4, v1, s2 +; 
GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v5 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v6 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v6 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v6 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v6 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v6 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def 
$0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v6, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v6, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> 
%vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_13: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def 
$0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i16_v8i16__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v0, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i16_v8i16__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v5, v1, 16 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v5, v1, 16 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v5, v1, 16 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v7, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v6, v2, 16 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v7, v2, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, 
v7, v2, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v7, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v7, v3, 16 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v7, v3, 16 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v7, v3, 16 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() 
+ %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_14: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__u_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__u_15: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__0_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v5 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v5 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__1_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX900-NEXT: global_store_dword v5, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i16_v8i16__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__2_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v5 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v5 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v5 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__3_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX900-NEXT: global_store_dword v6, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v5, v1, s4 +; GFX90A-NEXT: global_store_dword v6, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v5, v1, s2 +; GFX940-NEXT: global_store_dword v6, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__4_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v6 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v7 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v7 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__5_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v6, v2, s4 +; GFX900-NEXT: global_store_dword v7, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v2, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v7, v2, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__6_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v7 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v7 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v7 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__7_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX900-NEXT: global_store_dword v8, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v7, v3, s4 +; GFX90A-NEXT: global_store_dword v8, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, 
v7, v3, s2 +; GFX940-NEXT: global_store_dword v8, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__8_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) 
%ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__9_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__9_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__9_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__9_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__10_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__10_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__10_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__10_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__11_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__11_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__11_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__11_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__12_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v2, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i16_v8i16__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v2, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__13_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__13_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__13_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v2, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__13_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v2, s2 +; GFX940-NEXT: 
global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @v_shuffle_v2i16_v8i16__14_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i16_v8i16__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i16_v8i16__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i16_v8i16__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=v"() + %vec1 = call <8 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + store <2 x i16> %shuf, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v2i16_v8i16__1_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define 
void @s_shuffle_v2i16_v8i16__9_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + 
ret void +} + +define void @s_shuffle_v2i16_v8i16__15_u() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s11, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void 
+} + +define void @s_shuffle_v2i16_v8i16__15_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s7, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_7: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_8: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + 
+define void @s_shuffle_v2i16_v8i16__15_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_11: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__15_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__15_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__15_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__15_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x 
i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i16_v8i16__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_0() { +; GFX900-LABEL: 
s_shuffle_v2i16_v8i16__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_0: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_0() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", 
"=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_1: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; 
def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s8, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s8, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s4, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i16_v8i16__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_1() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, 
<2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} 
+ +define void @s_shuffle_v2i16_v8i16__9_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i16_v8i16__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_2() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_3: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm 
"; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_3: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s8, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i16_v8i16__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s8, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s4, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x 
i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_3() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call 
void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_4: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; 
GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_4: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v2i16_v8i16__8_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() 
+ %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_4() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_4() { +; GFX900-LABEL: 
s_shuffle_v2i16_v8i16__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i16_v8i16__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i16_v8i16__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s8, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s8, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s4, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s9, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s9, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s5, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x 
i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s9, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s9, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s5, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_5() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s8, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s8, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s4, 16 +; 
GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect 
"; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s9, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s9, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s5, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s6, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_6() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() 
+ %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i16_v8i16__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 
x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s8, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s8, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s4, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s6, s3 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_7() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + 
ret void +} + +define void @s_shuffle_v2i16_v8i16__u_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> 
%vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, 
s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_8() { +; GFX900-LABEL: 
s_shuffle_v2i16_v8i16__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_8() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_9() { +; GFX900-LABEL: 
s_shuffle_v2i16_v8i16__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> 
asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", 
"=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i16_v8i16__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s2, 
s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_9() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_10: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + 
%vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v2i16_v8i16__2_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_10: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + 
call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_10: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_10() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_11: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v2i16_v8i16__5_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_pack_lh_b32_b16 s4, s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_11() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect 
"; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i16_v8i16__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v2i16_v8i16__7_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + 
call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_12() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_13() { +; GFX900-LABEL: 
s_shuffle_v2i16_v8i16__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + 
%vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s10 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_13: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 
x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s6 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s6 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_13() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 
= call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, 
s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_14: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void 
asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__8_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x 
i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_14() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s4, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s0, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__u_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__0_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_pack_lh_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__1_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__2_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__3_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__4_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__5_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i16_v8i16__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__6_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__7_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v2i16_v8i16__8_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__9_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__9_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__9_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__9_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__10_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__10_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__10_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__10_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 
= call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__11_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__11_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__11_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__11_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__12_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s4, s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s0, s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__13_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__13_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s4, s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__13_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s4, s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__13_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s0, s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} + +define void @s_shuffle_v2i16_v8i16__14_15() { +; GFX900-LABEL: s_shuffle_v2i16_v8i16__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s7 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i16_v8i16__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s7 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i16_v8i16__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i16> asm "; def $0", "=s"() + %vec1 = call <8 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i16> %vec0, <8 x i16> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:10]}"(<2 x i16> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll new file mode 100644 index 0000000000000..f39a8eba9c19f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll @@ -0,0 +1,1875 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i32_v2i32__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v2i32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> poison + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i32_v2i32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x 
i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__2_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v2i32__2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + 
ret void +} + +define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__3_1(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2i32_v2i32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> zeroinitializer + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__1_0(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__u_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v2i32__u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v2i32__2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v2i32__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v2i32__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v2i32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v2i32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v2i32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v2i32_v2i32__u_u() { +; GFX9-LABEL: s_shuffle_v2i32_v2i32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__0_u() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__1_u() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v2i32_v2i32__2_u() { +; GFX9-LABEL: s_shuffle_v2i32_v2i32__2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__3_u() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__3_0() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__3_1() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__3_2() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__3_3() { +; GFX9-LABEL: s_shuffle_v2i32_v2i32__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__u_0() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__0_0() { +; GFX9-LABEL: s_shuffle_v2i32_v2i32__0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__1_0() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, 
s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__2_0() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect 
"; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__u_1() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__0_1() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__1_1() { +; GFX9-LABEL: s_shuffle_v2i32_v2i32__1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__2_1() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__u_2() { +; GFX9-LABEL: s_shuffle_v2i32_v2i32__u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__0_2() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x 
i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__1_2() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__2_2() { +; GFX9-LABEL: s_shuffle_v2i32_v2i32__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define 
void @s_shuffle_v2i32_v2i32__u_3() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__0_3() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__1_3() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v2i32__2_3() { +; GFX900-LABEL: s_shuffle_v2i32_v2i32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v2i32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v2i32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll new file mode 100644 index 0000000000000..c13317fb72f2f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll @@ -0,0 +1,4236 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i32_v3i32__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v3i32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> poison + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i32_v3i32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__3_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v3i32__3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__4_u(ptr addrspace(1) inreg %ptr) 
{ +; GFX900-LABEL: v_shuffle_v2i32_v3i32__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: 
global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i32_v3i32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm 
"; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> zeroinitializer + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__2_0(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2i32_v3i32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: 
global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + 
store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i32_v3i32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v3i32__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__u_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v3i32__u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i32_v3i32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 
x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v3i32__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> 
%vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: 
global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__1_4: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v3i32__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__3_5: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v3i32__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v3i32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v3i32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v3i32__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> 
asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v2i32_v3i32__u_u() { +; GFX9-LABEL: s_shuffle_v2i32_v3i32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__0_u() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret 
void +} + +define void @s_shuffle_v2i32_v3i32__1_u() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__2_u() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__3_u() { +; GFX9-LABEL: s_shuffle_v2i32_v3i32__3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__4_u() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__4_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__5_u() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__5_0() { +; GFX900-LABEL: 
s_shuffle_v2i32_v3i32__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__5_1() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b64 
s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__5_2() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__5_3() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__5_4() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x 
i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__5_5() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__u_0() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__0_0() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__1_0() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__2_0() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__3_0() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i32_v3i32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__4_0() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__u_1() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__0_1() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__1_1() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__2_1() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__3_1() { +; GFX900-LABEL: 
s_shuffle_v2i32_v3i32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__4_1() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__u_2() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__0_2() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__1_2() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__2_2() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 
s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__3_2() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__4_2() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__u_3() { +; GFX9-LABEL: s_shuffle_v2i32_v3i32__u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; 
GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__0_3() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__1_3() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__2_3() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__3_3() { +; GFX9-LABEL: s_shuffle_v2i32_v3i32__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__4_3() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__u_4() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__0_4() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__1_4() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i32_v3i32__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__2_4() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__3_4() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], 
s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__4_4() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__u_5() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__u_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__0_5() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__1_5() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__2_5() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__3_5() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x 
i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v3i32__4_5() { +; GFX900-LABEL: s_shuffle_v2i32_v3i32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v3i32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v3i32__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll new file mode 100644 index 0000000000000..9c8db5d590aaf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll @@ -0,0 +1,6929 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i32_v4i32__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v4i32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> poison + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i32_v4i32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__4_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v4i32__4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 
+; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: 
global_store_dwordx2 v7, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i32_v4i32__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x 
i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i32_v4i32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> zeroinitializer + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__0_1: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v7, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v7, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__u_3: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 
v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> <i32 0, i32 3> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> <i32 1, i32 3> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__2_3: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> <i32 2, i32 3> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 
v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> <i32 3, i32 3> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> <i32 4, i32 3> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 5, i32 3> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__6_3(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 6, i32 3> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__u_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v4i32__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> <i32 poison, i32 4> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> <i32 0, i32 4> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> <i32 1, i32 4> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> <i32 2, i32 4> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> <i32 3, i32 4> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: 
v_shuffle_v2i32_v4i32__4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> <i32 4, i32 4> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 5, i32 4> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__6_4(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 6, i32 4> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__u_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 poison, i32 5> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i32_v4i32__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 0, i32 5> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 1, i32 5> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 
v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 2, i32 5> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", 
"=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 3, i32 5> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 4, i32 5> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: 
global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 5, i32 5> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> <i32 6, i32 5> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i32_v4i32__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx2 v6, 
v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + 
store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v4i32__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v4i32__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v4i32__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v4i32__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v2i32_v4i32__u_u() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v2i32_v4i32__0_u() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__1_u() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__2_u() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__3_u() { +; 
GFX900-LABEL: s_shuffle_v2i32_v4i32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__4_u() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__5_u() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__6_u() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__7_u() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__7_0() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__7_1() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__7_2() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__7_3() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__7_4() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = 
call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__7_5() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__7_6() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: 
s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__7_7() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__u_0() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__0_0() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__1_0() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__2_0() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__3_0() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x 
i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__4_0() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__5_0() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__5_0: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__6_0() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__u_1() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def 
$0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__0_1() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__1_1() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i32_v4i32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__2_1() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__3_1() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__4_1() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__5_1() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def 
$0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__6_1() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__u_2() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__0_2() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2i32_v4i32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__1_2() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__2_2() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__3_2() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__4_2() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__5_2() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__5_2: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__6_2() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__u_3() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 
x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__0_3() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__1_3() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__1_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call 
void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__2_3() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__3_3() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__4_3() { +; GFX900-LABEL: 
s_shuffle_v2i32_v4i32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__5_3() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__6_3() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__u_4() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__0_4() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__1_4() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__2_4() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__3_4() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__4_4() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__5_4() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", 
"=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__6_4() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__6_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__u_5() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__0_5() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret 
void +} + +define void @s_shuffle_v2i32_v4i32__1_5() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__2_5() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__3_5() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__3_5: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__4_5() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__4_5: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__5_5() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + 
call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__6_5() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__6_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__u_6() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> 
%vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__0_6() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__1_6() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__2_6() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__3_6() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__4_6() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__5_6() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v2i32_v4i32__6_6() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__u_7() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__0_7() { +; GFX900-LABEL: 
s_shuffle_v2i32_v4i32__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__1_7() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__2_7() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__3_7() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__4_7() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 
x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__5_7() { +; GFX9-LABEL: s_shuffle_v2i32_v4i32__5_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v4i32__6_7() { +; GFX900-LABEL: s_shuffle_v2i32_v4i32__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v4i32__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v4i32__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll new file mode 100644 index 0000000000000..5cc870b1aa805 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll @@ -0,0 +1,25924 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i32_v8i32__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v8i32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> poison + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + 
%shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i32_v8i32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v8i32__8_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i32_v8i32__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, 
<2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v8i32__15_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v8 +; GFX900-NEXT: v_mov_b32_e32 v2, 
v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v9 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v9 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v10 +; GFX900-NEXT: global_store_dwordx2 v11, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v11 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v11 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v12 +; GFX900-NEXT: global_store_dwordx2 v13, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v13 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; 
GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i32_v8i32__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v13 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v14 +; GFX900-NEXT: global_store_dwordx2 v15, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v15 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v15 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v15 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector 
<8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 
+ ret void +} + +define void @v_shuffle_v2i32_v8i32__15_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__15_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__15_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; 
GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__15_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__15_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, 
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> zeroinitializer + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_0(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2i32_v8i32__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_0: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; 
def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v8i32__13_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i32_v8i32__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_1: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} 
+ +define void @v_shuffle_v2i32_v8i32__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i32_v8i32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v8i32__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i32_v8i32__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def 
$0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_1: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: 
global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define 
void @v_shuffle_v2i32_v8i32__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i32_v8i32__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x 
i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_2(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v2 +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_3(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2i32_v8i32__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf 
= shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v8i32__11_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v9 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v9 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 
v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_4: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v8i32__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i32_v8i32__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_4: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 10, i32 4> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x 
i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 11, i32 4> ; selects %vec1[3], %vec0[4] + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v4 +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 12, i32 4> ; selects %vec1[4], %vec0[4] + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_4(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v11 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v4 +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i32_v8i32__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 
x i32> <i32 1, i32 5> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> <i32 2, i32 5> ; selects %vec0[2], %vec0[5] + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> <i32 4, i32 5> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> <i32 5, i32 5> ; selects %vec0[5] for both lanes + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_5(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> <i32 8, i32 5> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm 
"; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 9, i32 5> ; selects %vec1[1], %vec0[5] + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 10, i32 5> ; selects %vec1[2], %vec0[5] + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v8i32__11_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v9 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v9 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v11 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v11 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 
v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_6: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v8i32__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i32_v8i32__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_6: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v11, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v6 +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v11 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = 
shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v13, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v6 +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v8i32__13_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[12:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v13 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_6: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[13:14], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v15, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v15, v6 +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i32_v8i32__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x 
i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i32_v8i32__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 
x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_7: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v9 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v9 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call 
<8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret 
void +} + +define void @v_shuffle_v2i32_v8i32__11_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v11 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v11 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[13:14], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v13 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v13 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx2 v16, 
v[14:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v15, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v15, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v8i32__u_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v2i32_v8i32__3_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: 
global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i32_v8i32__8_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: 
v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_8(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_9: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; 
GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", 
"=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 
8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v8 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i32_v8i32__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + 
%shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; 
GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, 
v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_10: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_10: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v8 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v8 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v10 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v10 +; GFX940-NEXT: 
global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x 
i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 
v4, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm 
"; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v8 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_11: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v9 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v10 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v7 +; GFX900-NEXT: 
global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_11(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2i32_v8i32__11_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i32_v8i32__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_11: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def 
$0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_12: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v8 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v8 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v5, v10 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v10 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v12 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v12 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 
x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; 
GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; 
GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define 
void @v_shuffle_v2i32_v8i32__14_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v8 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v9 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v10 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + 
%shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v11 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2i32_v8i32__6_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v12 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_13: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_14(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2i32_v8i32__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v10 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v10 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v12 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v12 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v14 +; 
GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v14 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[13:14], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v14 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v14 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_14(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__u_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__0_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v8 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__1_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__2_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v10 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i32_v8i32__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__3_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__4_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v12 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v13 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__5_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__6_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v14 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v15 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> 
%vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__7_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__8_15(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__9_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__9_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i32_v8i32__9_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__9_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__10_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__10_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__10_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__10_15: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__11_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__11_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__11_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__11_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", 
"=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__12_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__13_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__13_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__13_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__13_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2i32_v8i32__14_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i32_v8i32__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i32_v8i32__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i32_v8i32__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=v"() + %vec1 = call <8 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + store <2 x i32> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_u() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i32_v8i32__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__2_u: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_u() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__8_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: 
; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__10_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_u() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i32_v8i32__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s15 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s19 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s19 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s15 +; GFX940-NEXT: s_mov_b64 
s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s15 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 
+; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_11() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__15_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v2i32_v8i32__15_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__15_15() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__15_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x 
i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_0() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_0() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__6_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_0: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x 
i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_0() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__14_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_1() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_1: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_1() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__6_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], 
s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i32_v8i32__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; 
GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i32_v8i32__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 
0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_1() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + 
%vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_2() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_2() { +; GFX900-LABEL: 
s_shuffle_v2i32_v8i32__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_2() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__6_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s6 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v2i32_v8i32__7_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def 
$0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_2() { +; GFX900-LABEL: 
s_shuffle_v2i32_v8i32__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s13, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_2() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2i32_v8i32__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_3() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__1_3: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_3() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 
s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_3() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__5_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] 
+; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_3() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__6_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_3() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__7_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, 
<2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_3() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_4() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__2_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s12 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s8 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i32_v8i32__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x 
i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_4() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__6_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; 
use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s17, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s13 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_4() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s12 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], 
s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_5: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + 
+define void @s_shuffle_v2i32_v8i32__2_5() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__2_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_5() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__6_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void 
asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s17 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s17 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s13 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_5() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2i32_v8i32__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; 
use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i32_v8i32__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_6() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__2_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s14 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_6() { +; GFX9-LABEL: 
s_shuffle_v2i32_v8i32__6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v2i32_v8i32__8_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s17, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s13 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v2i32_v8i32__14_6() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_7() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__1_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_7() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__2_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_7() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__3_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_7() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__5_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_7() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__10_7: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void 
asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_7() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_8() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__u_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_8: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i32_v8i32__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__6_8: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_8: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_8() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__8_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_8() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__10_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_8: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> 
%vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_8() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_8() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__14_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i32_v8i32__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def 
$0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_9: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_9: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_9() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__10_9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_9() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_9() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__14_9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND 
+; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s14 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s14 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s10 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v2i32_v8i32__7_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_10() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__10_10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s2 +; 
GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_10() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_10() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__14_10: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s6 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + 
+define void @s_shuffle_v2i32_v8i32__6_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_11() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__9_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_11() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__11_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_11() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_11() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__13_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_11() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__14_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def 
$0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s12 +; GFX900-NEXT: s_mov_b64 
s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s12 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s8 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 0, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_12: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 1, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s16 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 2, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 3, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s16 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s16 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s12 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 4, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s12 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> 
%vec1, <2 x i32> <i32 5, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 6, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s12 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 7, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i32_v8i32__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 8, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 9, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_12() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__10_12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s12 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 10, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 11, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call 
<8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 12, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_12() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 13, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_12() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__14_12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use 
s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 14, i32 12> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 poison, i32 13> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 0, i32 13> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 1, i32 13> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 2, i32 13> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2i32_v8i32__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 3, i32 13> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s13 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 4, i32 13> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 5, i32 13> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> <i32 6, i32 13> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v2i32_v8i32__7_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: 
s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_13() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__10_13: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], 
s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_13() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_13() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__14_13: +; GFX9: ; 
%bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_14() { +; GFX900-LABEL: 
s_shuffle_v2i32_v8i32__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i32_v8i32__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: 
s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s18 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s18 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s14 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: 
s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect 
"; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_14() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__10_14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s14 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_14() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v2i32_v8i32__14_14() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__14_14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__u_15() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__0_15() { +; GFX900-LABEL: 
s_shuffle_v2i32_v8i32__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__1_15() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__2_15() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__3_15() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2i32_v8i32__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__4_15() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__5_15() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() 
+ %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__6_15() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__7_15() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__8_15() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__8_15: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__9_15() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__9_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__10_15() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__10_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND 
+; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__11_15() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__11_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__12_15() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__13_15() { +; GFX9-LABEL: s_shuffle_v2i32_v8i32__13_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} + +define void @s_shuffle_v2i32_v8i32__14_15() { +; GFX900-LABEL: s_shuffle_v2i32_v8i32__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i32_v8i32__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i32_v8i32__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i32> asm "; def $0", "=s"() + %vec1 = call <8 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i32> %vec0, <8 x i32> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x i32> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll new file mode 100644 index 0000000000000..185b173e8fee5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll @@ -0,0 +1,2104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i64_v2i64__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v2i64__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> poison + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i64_v2i64__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__2_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v2i64__2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i64_v2i64__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i64_v2i64__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = 
shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> zeroinitializer + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i64_v2i64__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret 
void +} + +define void @v_shuffle_v2i64_v2i64__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__u_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v2i64__u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v2i64__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v2i64__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v2i64__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v2i64__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i64_v2i64__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v2i64__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v2i64_v2i64__u_u() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__0_u() { +; 
GFX900-LABEL: s_shuffle_v2i64_v2i64__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__1_u() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__2_u() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__3_u() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__3_0() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; 
GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__3_1() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__3_2() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; 
GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> <i32 3, i32 2> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__3_3() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> <i32 3, i32 3> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__u_0() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__u_0: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__0_0() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__1_0() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: 
s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> <i32 1, i32 0> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__2_0() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> <i32 2, i32 0> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__u_1() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__0_1() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> <i32 0, i32 1> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__1_1() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> <i32 1, i32 1> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__2_1() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> <i32 2, i32 1> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__u_2() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> <i32 poison, i32 2> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v2i64_v2i64__0_2() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__1_2() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> <i32 1, i32 2> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__2_2() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <2 x i32> <i32 2, i32 2> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__u_3() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__0_3() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__0_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__1_3() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; 
GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> <i32 1, i32 3> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v2i64__2_3() { +; GFX900-LABEL: s_shuffle_v2i64_v2i64__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v2i64__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v2i64__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <2 x i32> <i32 2, i32 3> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll new file mode 100644 index 0000000000000..9dd55867fdf91 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll @@ -0,0 +1,4469 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i64_v3i64__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v3i64__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> poison + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i64_v3i64__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__2_u(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__3_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v3i64__3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v2i64_v3i64__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_u: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 
v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__5_4: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i64_v3i64__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> zeroinitializer + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__3_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i64_v3i64__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i64_v3i64__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__u_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v3i64__u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v2i64_v3i64__3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v3i64__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> 
%shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, 
v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i64_v3i64__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; 
GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 
v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v3i64__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v3i64__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v3i64__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v3i64__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v2i64_v3i64__u_u() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__0_u() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__1_u() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__2_u() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__3_u() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__4_u() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__5_u() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__5_0() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__5_1() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__5_2() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__5_3() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__5_4() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__5_5() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> 
%shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__u_0() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__0_0() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__1_0() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__2_0() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__3_0() { +; 
GFX900-LABEL: s_shuffle_v2i64_v3i64__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__4_0() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i64_v3i64__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__u_1() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__0_1() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__1_1() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__2_1() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__3_1() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v2i64_v3i64__4_1() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__u_2() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__0_2() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__1_2() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__2_2() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__3_2() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__4_2() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__u_3() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> 
asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__0_3() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__1_3() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__2_3() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__3_3() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__4_3() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__u_4() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__0_4() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + 
+define void @s_shuffle_v2i64_v3i64__1_4() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__2_4() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__3_4() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__4_4() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__u_5() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__0_5() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__1_5() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__2_5() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__3_5() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v3i64__4_5() { +; GFX900-LABEL: s_shuffle_v2i64_v3i64__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v3i64__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v3i64__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll new file mode 100644 index 0000000000000..3e7f0d057b6d1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll @@ -0,0 +1,7547 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i64_v4i64__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v4i64__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> poison + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__1_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_u: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__4_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v4i64__4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i64_v4i64__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 
+; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v2i64_v4i64__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; 
GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> zeroinitializer + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 
v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i64_v4i64__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 
x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + 
+define void @v_shuffle_v2i64_v4i64__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v2 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__u_2(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__3_2(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2i64_v4i64__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v4 +; GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, v6 +; GFX90A-NEXT: v_mov_b32_e32 v15, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v6 +; GFX940-NEXT: v_mov_b32_e32 v15, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__u_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v4i64__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i64_v4i64__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__4_4(ptr addrspace(1) inreg %ptr) { +; 
GFX9-LABEL: v_shuffle_v2i64_v4i64__4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define 
void @v_shuffle_v2i64_v4i64__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, 
v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, 
v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__6_5(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2i64_v4i64__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, 
v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i64_v4i64__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__6_6: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i64_v4i64__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: 
global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v2i64_v4i64__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__3_7: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; 
GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x 
i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v4i64__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v4i64__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v4i64__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v4i64__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v2i64_v4i64__u_u() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__0_u() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__1_u() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__2_u() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__3_u() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__4_u() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__5_u() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__6_u() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__7_u() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__7_0() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; 
GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__7_1() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__7_2() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__7_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__7_3() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__7_4() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + 
%shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__7_5() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__7_6() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: 
s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__7_7() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; 
GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__u_0() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__0_0() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__1_0() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: 
s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__2_0() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: 
s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__3_0() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: 
s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__4_0() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__5_0() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__6_0() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__u_1() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__0_1() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 
x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__1_1() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__2_1() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__3_1() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__4_1() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__5_1() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__6_1() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__u_2() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__0_2() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__1_2() { +; GFX900-LABEL: 
s_shuffle_v2i64_v4i64__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__2_2() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__3_2() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i64_v4i64__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__4_2() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__5_2() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + 
+define void @s_shuffle_v2i64_v4i64__6_2() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__u_3() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__0_3() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__1_3() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__2_3() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__3_3() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i64_v4i64__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__4_3() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__5_3() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__6_3() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__u_4() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__0_4() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__1_4() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i64_v4i64__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__2_4() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__3_4() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__4_4() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_4: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__5_4() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__6_4() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__u_5() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__0_5() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__1_5() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__2_5() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__3_5() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call 
void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__4_5() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__5_5() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__6_5() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__u_6() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__0_6() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__1_6() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, 
s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__2_6() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__3_6() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__4_6() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__5_6() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__6_6() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__u_7() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__0_7() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__1_7() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__2_7() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s14 +; GFX940-NEXT: s_mov_b32 
s7, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__3_7() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> 
%vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__4_7() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__5_7() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v4i64__6_7() { +; GFX900-LABEL: s_shuffle_v2i64_v4i64__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v4i64__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v4i64__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 
0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll new file mode 100644 index 0000000000000..5c6d62cb1649d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll @@ -0,0 +1,31395 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2i64_v8i64__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v8i64__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> poison + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + 
+define void @v_shuffle_v2i64_v8i64__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v8i64__8_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, 
v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() 
+ %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, 0 +; GFX900-NEXT: v_mov_b32_e32 v16, v14 +; GFX900-NEXT: v_mov_b32_e32 v17, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, v0 +; GFX900-NEXT: v_mov_b32_e32 v19, v1 +; GFX900-NEXT: global_store_dwordx4 v20, v[16:19], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v16 +; GFX90A-NEXT: v_mov_b32_e32 v3, v17 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v18, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v16 +; GFX940-NEXT: v_mov_b32_e32 v3, v17 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v18, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define 
void @v_shuffle_v2i64_v8i64__15_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v18 +; GFX900-NEXT: v_mov_b32_e32 v1, v19 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v18 +; GFX90A-NEXT: v_mov_b32_e32 v1, v19 +; GFX90A-NEXT: global_store_dwordx4 v20, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v18 +; GFX940-NEXT: v_mov_b32_e32 v1, v19 +; GFX940-NEXT: global_store_dwordx4 v20, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_2(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v20 +; GFX900-NEXT: v_mov_b32_e32 v3, v21 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v20 +; GFX90A-NEXT: v_mov_b32_e32 v3, v21 +; GFX90A-NEXT: global_store_dwordx4 v22, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v20 +; GFX940-NEXT: v_mov_b32_e32 v3, v21 +; GFX940-NEXT: global_store_dwordx4 v22, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i64_v8i64__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v22 +; GFX900-NEXT: v_mov_b32_e32 v5, v23 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v22 +; GFX90A-NEXT: v_mov_b32_e32 v5, v23 +; GFX90A-NEXT: global_store_dwordx4 v24, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v22 +; GFX940-NEXT: v_mov_b32_e32 v5, v23 +; GFX940-NEXT: global_store_dwordx4 v24, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_4: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v24 +; GFX900-NEXT: v_mov_b32_e32 v7, v25 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v24 +; GFX90A-NEXT: v_mov_b32_e32 v7, v25 +; GFX90A-NEXT: global_store_dwordx4 v26, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v24 +; GFX940-NEXT: v_mov_b32_e32 v7, v25 +; GFX940-NEXT: global_store_dwordx4 v26, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v26 +; GFX900-NEXT: v_mov_b32_e32 v9, v27 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v26 +; GFX90A-NEXT: v_mov_b32_e32 v9, v27 +; GFX90A-NEXT: global_store_dwordx4 v28, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v26 +; GFX940-NEXT: v_mov_b32_e32 v9, v27 +; GFX940-NEXT: global_store_dwordx4 v28, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v28 +; GFX900-NEXT: v_mov_b32_e32 v11, v29 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v28 +; GFX90A-NEXT: v_mov_b32_e32 v11, v29 +; GFX90A-NEXT: global_store_dwordx4 v30, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v28 +; GFX940-NEXT: v_mov_b32_e32 v11, v29 +; GFX940-NEXT: global_store_dwordx4 v30, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v30 +; GFX900-NEXT: v_mov_b32_e32 v13, v31 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v30 +; GFX90A-NEXT: v_mov_b32_e32 v13, v31 +; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v30 +; GFX940-NEXT: v_mov_b32_e32 v13, v31 +; GFX940-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; 
GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_11: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v14 +; GFX900-NEXT: v_mov_b32_e32 v7, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v14 +; GFX90A-NEXT: v_mov_b32_e32 v7, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v14 +; GFX940-NEXT: v_mov_b32_e32 v7, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v14 +; GFX940-NEXT: v_mov_b32_e32 v9, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void 
+} + +define void @v_shuffle_v2i64_v8i64__15_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v14 +; GFX900-NEXT: v_mov_b32_e32 v11, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v14 +; GFX90A-NEXT: v_mov_b32_e32 v11, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v14 +; GFX940-NEXT: v_mov_b32_e32 v11, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__15_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__15_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__15_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__15_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2i64_v8i64__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> zeroinitializer + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i64_v8i64__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v0 +; GFX940-NEXT: v_mov_b32_e32 v11, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v0 +; GFX900-NEXT: v_mov_b32_e32 v13, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, 
v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v0 +; GFX940-NEXT: v_mov_b32_e32 v13, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v0 +; GFX900-NEXT: v_mov_b32_e32 v15, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v0 +; GFX940-NEXT: v_mov_b32_e32 v15, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i64_v8i64__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v18, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v18, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v18, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx4 v18, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v18, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v0 +; GFX940-NEXT: v_mov_b32_e32 v11, v1 +; GFX940-NEXT: global_store_dwordx4 v18, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v0 +; GFX900-NEXT: v_mov_b32_e32 v13, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v1 +; GFX90A-NEXT: global_store_dwordx4 v18, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v0 +; GFX940-NEXT: v_mov_b32_e32 v13, v1 +; GFX940-NEXT: global_store_dwordx4 v18, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v0 +; GFX900-NEXT: v_mov_b32_e32 v15, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v1 +; GFX90A-NEXT: global_store_dwordx4 v18, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v0 +; GFX940-NEXT: v_mov_b32_e32 v15, v1 +; GFX940-NEXT: global_store_dwordx4 v18, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v0 +; GFX900-NEXT: v_mov_b32_e32 v17, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v17, v1 +; GFX90A-NEXT: global_store_dwordx4 v18, v[14:17], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v16, v0 +; GFX940-NEXT: v_mov_b32_e32 v17, v1 +; GFX940-NEXT: global_store_dwordx4 v18, v[14:17], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) 
%ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; 
GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v2 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v2 +; GFX900-NEXT: v_mov_b32_e32 v13, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v13, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v2 +; GFX940-NEXT: v_mov_b32_e32 v13, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v2 +; GFX900-NEXT: v_mov_b32_e32 v15, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v2 +; GFX90A-NEXT: v_mov_b32_e32 v15, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v2 +; GFX940-NEXT: v_mov_b32_e32 v15, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def 
$0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 
v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v20, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v20, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v20, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v2 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx4 v20, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v2 +; GFX900-NEXT: v_mov_b32_e32 v13, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v13, v3 +; GFX90A-NEXT: global_store_dwordx4 v20, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v2 +; GFX940-NEXT: v_mov_b32_e32 v13, v3 +; GFX940-NEXT: global_store_dwordx4 v20, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v2 +; GFX900-NEXT: v_mov_b32_e32 v15, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v2 +; 
GFX90A-NEXT: v_mov_b32_e32 v15, v3 +; GFX90A-NEXT: global_store_dwordx4 v20, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v2 +; GFX940-NEXT: v_mov_b32_e32 v15, v3 +; GFX940-NEXT: global_store_dwordx4 v20, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v2 +; GFX900-NEXT: v_mov_b32_e32 v17, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v17, v3 +; 
GFX90A-NEXT: global_store_dwordx4 v20, v[14:17], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v16, v2 +; GFX940-NEXT: v_mov_b32_e32 v17, v3 +; GFX940-NEXT: global_store_dwordx4 v20, v[14:17], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, v2 +; GFX900-NEXT: v_mov_b32_e32 v19, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v2 +; GFX90A-NEXT: v_mov_b32_e32 v19, v3 +; GFX90A-NEXT: global_store_dwordx4 v20, 
v[16:19], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v18, v2 +; GFX940-NEXT: v_mov_b32_e32 v19, v3 +; GFX940-NEXT: global_store_dwordx4 v20, v[16:19], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 
v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i64_v8i64__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v4 +; GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v4 +; GFX900-NEXT: v_mov_b32_e32 v15, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v4 +; GFX90A-NEXT: v_mov_b32_e32 v15, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v4 +; GFX940-NEXT: v_mov_b32_e32 v15, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_2: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v22, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: 
v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v22, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx4 v22, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v4 +; GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: 
global_store_dwordx4 v22, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v4 +; GFX900-NEXT: v_mov_b32_e32 v15, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v4 +; GFX90A-NEXT: v_mov_b32_e32 v15, v5 +; GFX90A-NEXT: global_store_dwordx4 v22, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v4 +; GFX940-NEXT: v_mov_b32_e32 v15, v5 +; GFX940-NEXT: global_store_dwordx4 v22, v[12:15], s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v4 +; GFX900-NEXT: v_mov_b32_e32 v17, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v4 +; GFX90A-NEXT: v_mov_b32_e32 v17, v5 +; GFX90A-NEXT: global_store_dwordx4 v22, v[14:17], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v16, v4 +; GFX940-NEXT: v_mov_b32_e32 v17, v5 +; GFX940-NEXT: global_store_dwordx4 v22, v[14:17], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, v4 +; GFX900-NEXT: v_mov_b32_e32 v19, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v4 +; GFX90A-NEXT: v_mov_b32_e32 v19, v5 +; GFX90A-NEXT: global_store_dwordx4 v22, v[16:19], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v18, v4 +; GFX940-NEXT: v_mov_b32_e32 v19, v5 +; GFX940-NEXT: global_store_dwordx4 v22, v[16:19], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, v4 +; GFX900-NEXT: v_mov_b32_e32 v21, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v4 +; GFX90A-NEXT: v_mov_b32_e32 v21, v5 +; GFX90A-NEXT: global_store_dwordx4 v22, v[18:21], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v20, v4 +; GFX940-NEXT: v_mov_b32_e32 v21, v5 +; GFX940-NEXT: global_store_dwordx4 v22, v[18:21], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = 
call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 
v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; 
GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i64_v8i64__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v6 +; GFX90A-NEXT: v_mov_b32_e32 v15, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v6 +; GFX940-NEXT: v_mov_b32_e32 v15, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: 
v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v24, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v24, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v6 +; GFX90A-NEXT: v_mov_b32_e32 v15, v7 +; GFX90A-NEXT: global_store_dwordx4 v24, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v6 +; GFX940-NEXT: v_mov_b32_e32 v15, v7 +; GFX940-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v6 +; GFX900-NEXT: v_mov_b32_e32 v17, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v6 +; GFX90A-NEXT: v_mov_b32_e32 v17, v7 +; GFX90A-NEXT: global_store_dwordx4 v24, v[14:17], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v16, v6 +; GFX940-NEXT: v_mov_b32_e32 v17, v7 +; GFX940-NEXT: global_store_dwordx4 v24, v[14:17], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v18, v6 +; GFX900-NEXT: v_mov_b32_e32 v19, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v6 +; GFX90A-NEXT: v_mov_b32_e32 v19, v7 +; GFX90A-NEXT: global_store_dwordx4 v24, v[16:19], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v18, v6 +; GFX940-NEXT: v_mov_b32_e32 v19, v7 +; GFX940-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, v6 +; GFX900-NEXT: 
v_mov_b32_e32 v21, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v6 +; GFX90A-NEXT: v_mov_b32_e32 v21, v7 +; GFX90A-NEXT: global_store_dwordx4 v24, v[18:21], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v20, v6 +; GFX940-NEXT: v_mov_b32_e32 v21, v7 +; GFX940-NEXT: global_store_dwordx4 v24, v[18:21], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, v6 +; GFX900-NEXT: v_mov_b32_e32 v23, v7 +; GFX900-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v6 +; GFX90A-NEXT: v_mov_b32_e32 v23, v7 +; GFX90A-NEXT: global_store_dwordx4 v24, v[20:23], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v22, v6 +; GFX940-NEXT: v_mov_b32_e32 v23, v7 +; GFX940-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_4: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf 
= shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; 
GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v8 +; GFX900-NEXT: v_mov_b32_e32 v13, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v8 +; GFX90A-NEXT: v_mov_b32_e32 v13, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 
+; GFX940-NEXT: v_mov_b32_e32 v12, v8 +; GFX940-NEXT: v_mov_b32_e32 v13, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v8 +; GFX900-NEXT: v_mov_b32_e32 v15, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v8 +; GFX90A-NEXT: v_mov_b32_e32 v15, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v8 +; GFX940-NEXT: v_mov_b32_e32 v15, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v14 +; GFX900-NEXT: v_mov_b32_e32 v7, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v14 +; GFX90A-NEXT: v_mov_b32_e32 v7, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v14 +; GFX940-NEXT: v_mov_b32_e32 v7, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v8 +; GFX900-NEXT: v_mov_b32_e32 v15, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 
+; GFX90A-NEXT: v_mov_b32_e32 v14, v8 +; GFX90A-NEXT: v_mov_b32_e32 v15, v9 +; GFX90A-NEXT: global_store_dwordx4 v26, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v8 +; GFX940-NEXT: v_mov_b32_e32 v15, v9 +; GFX940-NEXT: global_store_dwordx4 v26, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v8 +; GFX900-NEXT: v_mov_b32_e32 v17, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v8 +; 
GFX90A-NEXT: v_mov_b32_e32 v17, v9 +; GFX90A-NEXT: global_store_dwordx4 v26, v[14:17], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v16, v8 +; GFX940-NEXT: v_mov_b32_e32 v17, v9 +; GFX940-NEXT: global_store_dwordx4 v26, v[14:17], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, v8 +; GFX900-NEXT: v_mov_b32_e32 v19, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v8 +; GFX90A-NEXT: v_mov_b32_e32 v19, v9 +; 
GFX90A-NEXT: global_store_dwordx4 v26, v[16:19], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v18, v8 +; GFX940-NEXT: v_mov_b32_e32 v19, v9 +; GFX940-NEXT: global_store_dwordx4 v26, v[16:19], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, v8 +; GFX900-NEXT: v_mov_b32_e32 v21, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v8 +; GFX90A-NEXT: v_mov_b32_e32 v21, v9 +; GFX90A-NEXT: global_store_dwordx4 v26, 
v[18:21], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v20, v8 +; GFX940-NEXT: v_mov_b32_e32 v21, v9 +; GFX940-NEXT: global_store_dwordx4 v26, v[18:21], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, v8 +; GFX900-NEXT: v_mov_b32_e32 v23, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v8 +; GFX90A-NEXT: v_mov_b32_e32 v23, v9 +; GFX90A-NEXT: global_store_dwordx4 v26, v[20:23], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v22, v8 +; GFX940-NEXT: v_mov_b32_e32 v23, v9 +; GFX940-NEXT: global_store_dwordx4 v26, v[20:23], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, v8 +; GFX900-NEXT: v_mov_b32_e32 v25, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v8 +; GFX90A-NEXT: v_mov_b32_e32 v25, v9 +; GFX90A-NEXT: global_store_dwordx4 v26, v[22:25], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v24, v8 +; GFX940-NEXT: v_mov_b32_e32 v25, v9 +; GFX940-NEXT: global_store_dwordx4 v26, v[22:25], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x 
i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v10 +; GFX940-NEXT: v_mov_b32_e32 v13, v11 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v10 +; GFX900-NEXT: v_mov_b32_e32 v15, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v10 +; GFX90A-NEXT: v_mov_b32_e32 v15, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v10 +; GFX940-NEXT: v_mov_b32_e32 v15, v11 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v14 +; GFX940-NEXT: v_mov_b32_e32 v9, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, v10 +; GFX900-NEXT: v_mov_b32_e32 v17, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, v10 +; GFX90A-NEXT: v_mov_b32_e32 v17, v11 +; GFX90A-NEXT: global_store_dwordx4 v28, v[14:17], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v16, v10 +; GFX940-NEXT: v_mov_b32_e32 v17, v11 +; 
GFX940-NEXT: global_store_dwordx4 v28, v[14:17], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, v10 +; GFX900-NEXT: v_mov_b32_e32 v19, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v10 +; GFX90A-NEXT: v_mov_b32_e32 v19, v11 +; GFX90A-NEXT: global_store_dwordx4 v28, v[16:19], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v18, v10 +; GFX940-NEXT: v_mov_b32_e32 v19, v11 +; GFX940-NEXT: global_store_dwordx4 v28, 
v[16:19], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, v10 +; GFX900-NEXT: v_mov_b32_e32 v21, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v10 +; GFX90A-NEXT: v_mov_b32_e32 v21, v11 +; GFX90A-NEXT: global_store_dwordx4 v28, v[18:21], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v20, v10 +; GFX940-NEXT: v_mov_b32_e32 v21, v11 +; GFX940-NEXT: global_store_dwordx4 v28, v[18:21], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, v10 +; GFX900-NEXT: v_mov_b32_e32 v23, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v10 +; GFX90A-NEXT: v_mov_b32_e32 v23, v11 +; GFX90A-NEXT: global_store_dwordx4 v28, v[20:23], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v22, v10 +; GFX940-NEXT: v_mov_b32_e32 v23, v11 +; GFX940-NEXT: global_store_dwordx4 v28, v[20:23], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, v10 +; GFX900-NEXT: v_mov_b32_e32 v25, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v10 +; GFX90A-NEXT: v_mov_b32_e32 v25, v11 +; GFX90A-NEXT: global_store_dwordx4 v28, v[22:25], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v24, v10 +; GFX940-NEXT: v_mov_b32_e32 v25, v11 +; GFX940-NEXT: global_store_dwordx4 v28, v[22:25], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v26, v10 +; GFX900-NEXT: v_mov_b32_e32 v27, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[24:27], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v10 +; GFX90A-NEXT: v_mov_b32_e32 v27, v11 +; GFX90A-NEXT: global_store_dwordx4 v28, v[24:27], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v26, v10 +; GFX940-NEXT: v_mov_b32_e32 v27, v11 +; GFX940-NEXT: global_store_dwordx4 v28, v[24:27], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", 
"=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; 
GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v12 +; GFX900-NEXT: v_mov_b32_e32 v5, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, v12 +; GFX90A-NEXT: v_mov_b32_e32 v5, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v12 +; GFX940-NEXT: v_mov_b32_e32 v5, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v12 +; GFX940-NEXT: v_mov_b32_e32 v9, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x 
i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 +; GFX90A-NEXT: 
global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v12 +; GFX940-NEXT: v_mov_b32_e32 v15, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v14 +; GFX900-NEXT: v_mov_b32_e32 v11, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v14 +; GFX90A-NEXT: v_mov_b32_e32 v11, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v10, v14 +; GFX940-NEXT: v_mov_b32_e32 v11, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v18, v12 +; GFX900-NEXT: v_mov_b32_e32 v19, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[16:19], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, v12 +; GFX90A-NEXT: v_mov_b32_e32 v19, v13 +; GFX90A-NEXT: global_store_dwordx4 v30, v[16:19], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v18, v12 +; GFX940-NEXT: v_mov_b32_e32 v19, v13 +; GFX940-NEXT: global_store_dwordx4 v30, v[16:19], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, v12 +; GFX900-NEXT: v_mov_b32_e32 v21, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, v12 +; GFX90A-NEXT: v_mov_b32_e32 v21, v13 +; GFX90A-NEXT: global_store_dwordx4 v30, v[18:21], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v20, v12 +; GFX940-NEXT: v_mov_b32_e32 v21, v13 +; GFX940-NEXT: global_store_dwordx4 v30, v[18:21], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, v12 +; GFX900-NEXT: v_mov_b32_e32 v23, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, v12 +; GFX90A-NEXT: v_mov_b32_e32 v23, v13 +; GFX90A-NEXT: global_store_dwordx4 v30, v[20:23], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v22, v12 +; GFX940-NEXT: v_mov_b32_e32 v23, v13 +; GFX940-NEXT: global_store_dwordx4 v30, v[20:23], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, v12 +; GFX900-NEXT: v_mov_b32_e32 v25, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, v12 +; GFX90A-NEXT: v_mov_b32_e32 v25, v13 +; GFX90A-NEXT: global_store_dwordx4 v30, v[22:25], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v24, v12 +; GFX940-NEXT: v_mov_b32_e32 v25, v13 +; GFX940-NEXT: global_store_dwordx4 v30, v[22:25], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v26, v12 +; GFX900-NEXT: v_mov_b32_e32 v27, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[24:27], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, v12 +; GFX90A-NEXT: v_mov_b32_e32 v27, v13 +; GFX90A-NEXT: global_store_dwordx4 v30, v[24:27], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v26, v12 +; GFX940-NEXT: v_mov_b32_e32 v27, v13 +; GFX940-NEXT: global_store_dwordx4 v30, v[24:27], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v28, v12 +; GFX900-NEXT: v_mov_b32_e32 v29, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[26:29], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v28, v12 +; GFX90A-NEXT: v_mov_b32_e32 v29, v13 +; GFX90A-NEXT: global_store_dwordx4 v30, v[26:29], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v28, v12 +; GFX940-NEXT: v_mov_b32_e32 v29, v13 +; GFX940-NEXT: global_store_dwordx4 v30, v[26:29], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i64_v8i64__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v14 +; GFX900-NEXT: v_mov_b32_e32 v7, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v14 +; GFX90A-NEXT: v_mov_b32_e32 v7, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v14 +; GFX940-NEXT: v_mov_b32_e32 v7, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_7: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v14 +; GFX940-NEXT: v_mov_b32_e32 v9, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v14 +; GFX900-NEXT: v_mov_b32_e32 v11, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v14 +; GFX90A-NEXT: v_mov_b32_e32 v11, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v14 +; GFX940-NEXT: v_mov_b32_e32 v11, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; 
GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v20, v14 +; GFX900-NEXT: v_mov_b32_e32 v21, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[18:21], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, v14 +; GFX90A-NEXT: v_mov_b32_e32 v21, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[18:21], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v20, v14 +; GFX940-NEXT: v_mov_b32_e32 v21, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[18:21], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v22, v14 +; GFX900-NEXT: v_mov_b32_e32 v23, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[20:23], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, v14 +; GFX90A-NEXT: v_mov_b32_e32 v23, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v22, v14 +; GFX940-NEXT: v_mov_b32_e32 v23, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v24, v14 +; GFX900-NEXT: v_mov_b32_e32 v25, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[22:25], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, v14 +; GFX90A-NEXT: v_mov_b32_e32 v25, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[22:25], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v24, v14 +; GFX940-NEXT: v_mov_b32_e32 v25, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[22:25], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v26, v14 +; GFX900-NEXT: v_mov_b32_e32 v27, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[24:27], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, v14 +; GFX90A-NEXT: v_mov_b32_e32 v27, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v26, v14 +; GFX940-NEXT: v_mov_b32_e32 v27, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v28, v14 +; GFX900-NEXT: v_mov_b32_e32 v29, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[26:29], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, v14 +; GFX90A-NEXT: v_mov_b32_e32 v29, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[26:29], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v28, v14 +; GFX940-NEXT: v_mov_b32_e32 v29, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[26:29], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v30, v14 +; GFX900-NEXT: v_mov_b32_e32 v31, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[28:31], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v30, v14 +; GFX90A-NEXT: v_mov_b32_e32 v31, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v30, v14 +; GFX940-NEXT: v_mov_b32_e32 v31, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v8i64__u_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 
0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2i64_v8i64__8_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v0 +; GFX940-NEXT: v_mov_b32_e32 v11, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_8(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v0 +; GFX900-NEXT: v_mov_b32_e32 v13, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v0 +; GFX940-NEXT: v_mov_b32_e32 v13, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v0 +; GFX900-NEXT: v_mov_b32_e32 
v15, v1 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v0 +; GFX90A-NEXT: v_mov_b32_e32 v15, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v0 +; GFX940-NEXT: v_mov_b32_e32 v15, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 
+; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; 
GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v16 +; GFX900-NEXT: v_mov_b32_e32 v15, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v16 +; GFX90A-NEXT: v_mov_b32_e32 v15, v17 +; GFX90A-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v16 +; GFX940-NEXT: v_mov_b32_e32 v15, v17 +; 
GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v16 +; GFX900-NEXT: v_mov_b32_e32 v1, v17 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, v14 +; GFX90A-NEXT: v_mov_b32_e32 v17, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v16, v14 +; GFX940-NEXT: v_mov_b32_e32 v17, v15 +; GFX940-NEXT: global_store_dwordx4 v32, 
v[16:19], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i64_v8i64__10_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: 
global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v2 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v2 +; GFX900-NEXT: v_mov_b32_e32 v13, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v13, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v2 +; GFX940-NEXT: v_mov_b32_e32 v13, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> 
asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v2 +; GFX900-NEXT: v_mov_b32_e32 v15, v3 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v2 +; GFX90A-NEXT: v_mov_b32_e32 v15, v3 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v2 +; GFX940-NEXT: v_mov_b32_e32 v15, v3 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v12 +; GFX940-NEXT: v_mov_b32_e32 v9, v13 +; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v14 +; GFX900-NEXT: v_mov_b32_e32 v11, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v14 +; GFX90A-NEXT: v_mov_b32_e32 v11, v15 +; GFX90A-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v14 +; GFX940-NEXT: v_mov_b32_e32 v11, v15 +; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v16 +; GFX900-NEXT: v_mov_b32_e32 v13, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v12, v16 +; GFX90A-NEXT: v_mov_b32_e32 v13, v17 +; GFX90A-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v16 +; GFX940-NEXT: v_mov_b32_e32 v13, v17 +; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v18 +; GFX900-NEXT: v_mov_b32_e32 v15, v19 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v18 +; 
GFX90A-NEXT: v_mov_b32_e32 v15, v19 +; GFX90A-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v18 +; GFX940-NEXT: v_mov_b32_e32 v15, v19 +; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v16 +; GFX900-NEXT: v_mov_b32_e32 v3, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, v14 +; GFX90A-NEXT: v_mov_b32_e32 v19, v15 +; 
GFX90A-NEXT: global_store_dwordx4 v32, v[18:21], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v18, v14 +; GFX940-NEXT: v_mov_b32_e32 v19, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[18:21], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> 
%shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_10: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v4 +; GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v4 +; GFX900-NEXT: v_mov_b32_e32 v15, v5 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v4 +; GFX90A-NEXT: v_mov_b32_e32 v15, v5 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 
+; GFX940-NEXT: v_mov_b32_e32 v14, v4 +; GFX940-NEXT: v_mov_b32_e32 v15, v5 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2i64_v8i64__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_11: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v14 +; GFX940-NEXT: v_mov_b32_e32 v9, v15 +; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v16 +; GFX900-NEXT: v_mov_b32_e32 v11, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v16 +; GFX90A-NEXT: v_mov_b32_e32 v11, v17 +; GFX90A-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v16 +; GFX940-NEXT: v_mov_b32_e32 v11, v17 +; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v18 +; GFX900-NEXT: v_mov_b32_e32 v13, v19 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v18 +; GFX90A-NEXT: v_mov_b32_e32 v13, v19 +; GFX90A-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v18 +; GFX940-NEXT: v_mov_b32_e32 v13, v19 +; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v14, v20 +; GFX900-NEXT: v_mov_b32_e32 v15, v21 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v20 +; GFX90A-NEXT: v_mov_b32_e32 v15, v21 +; GFX90A-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v20 +; GFX940-NEXT: v_mov_b32_e32 v15, v21 +; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v16 +; 
GFX900-NEXT: v_mov_b32_e32 v5, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, v14 +; GFX90A-NEXT: v_mov_b32_e32 v21, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v20, v14 +; GFX940-NEXT: v_mov_b32_e32 v21, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) 
%ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_11: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v6 +; GFX90A-NEXT: v_mov_b32_e32 v15, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v6 +; GFX940-NEXT: v_mov_b32_e32 v15, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v12 +; GFX900-NEXT: v_mov_b32_e32 v5, v13 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v12 +; GFX90A-NEXT: v_mov_b32_e32 v5, v13 +; GFX90A-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_mov_b32_e32 v4, v12 +; GFX940-NEXT: v_mov_b32_e32 v5, v13 +; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v14 +; GFX900-NEXT: v_mov_b32_e32 v7, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v14 +; GFX90A-NEXT: v_mov_b32_e32 v7, v15 +; GFX90A-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v14 +; GFX940-NEXT: v_mov_b32_e32 
v7, v15 +; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v16 +; GFX900-NEXT: v_mov_b32_e32 v9, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v16 +; GFX90A-NEXT: v_mov_b32_e32 v9, v17 +; GFX90A-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v16 +; GFX940-NEXT: v_mov_b32_e32 v9, v17 +; GFX940-NEXT: global_store_dwordx4 v24, 
v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v18 +; GFX900-NEXT: v_mov_b32_e32 v11, v19 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v18 +; GFX90A-NEXT: v_mov_b32_e32 v11, v19 +; GFX90A-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v18 +; GFX940-NEXT: v_mov_b32_e32 v11, v19 +; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v20 +; GFX900-NEXT: v_mov_b32_e32 v13, v21 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v20 +; GFX90A-NEXT: v_mov_b32_e32 v13, v21 +; GFX90A-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v20 +; GFX940-NEXT: v_mov_b32_e32 v13, v21 +; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v22 +; GFX900-NEXT: v_mov_b32_e32 v15, v23 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v22 +; GFX90A-NEXT: v_mov_b32_e32 v15, v23 +; GFX90A-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v22 +; GFX940-NEXT: v_mov_b32_e32 v15, v23 +; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 
x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v16 +; GFX900-NEXT: v_mov_b32_e32 v7, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, v14 +; GFX90A-NEXT: v_mov_b32_e32 v23, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[22:25], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v22, v14 +; GFX940-NEXT: v_mov_b32_e32 v23, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[22:25], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = 
call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 
+; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v8 +; GFX900-NEXT: v_mov_b32_e32 v13, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v8 +; GFX90A-NEXT: v_mov_b32_e32 v13, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v8 +; GFX940-NEXT: v_mov_b32_e32 v13, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define 
void @v_shuffle_v2i64_v8i64__14_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v8 +; GFX900-NEXT: v_mov_b32_e32 v15, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v8 +; GFX90A-NEXT: v_mov_b32_e32 v15, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v8 +; GFX940-NEXT: v_mov_b32_e32 v15, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v16 +; GFX900-NEXT: v_mov_b32_e32 v7, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 
v6, v16 +; GFX90A-NEXT: v_mov_b32_e32 v7, v17 +; GFX90A-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v16 +; GFX940-NEXT: v_mov_b32_e32 v7, v17 +; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v18 +; GFX900-NEXT: v_mov_b32_e32 v9, v19 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v18 +; GFX90A-NEXT: v_mov_b32_e32 v9, v19 +; 
GFX90A-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v18 +; GFX940-NEXT: v_mov_b32_e32 v9, v19 +; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v20 +; GFX900-NEXT: v_mov_b32_e32 v11, v21 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v20 +; GFX90A-NEXT: v_mov_b32_e32 v11, v21 +; GFX90A-NEXT: global_store_dwordx4 v26, 
v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v20 +; GFX940-NEXT: v_mov_b32_e32 v11, v21 +; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v22 +; GFX900-NEXT: v_mov_b32_e32 v13, v23 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v22 +; GFX90A-NEXT: v_mov_b32_e32 v13, v23 +; GFX90A-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v22 +; GFX940-NEXT: v_mov_b32_e32 v13, v23 +; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v24 +; GFX900-NEXT: v_mov_b32_e32 v15, v25 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v24 +; GFX90A-NEXT: v_mov_b32_e32 v15, v25 +; GFX90A-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v24 +; GFX940-NEXT: v_mov_b32_e32 v15, v25 +; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v16 +; GFX900-NEXT: v_mov_b32_e32 v9, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, v14 +; GFX90A-NEXT: v_mov_b32_e32 v25, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2i64_v8i64__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v24, v14 +; GFX940-NEXT: v_mov_b32_e32 v25, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", 
"=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_13: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_13: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v10 +; GFX940-NEXT: v_mov_b32_e32 v13, v11 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v10 +; GFX900-NEXT: v_mov_b32_e32 v15, v11 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v10 +; GFX90A-NEXT: v_mov_b32_e32 v15, v11 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v10 +; GFX940-NEXT: v_mov_b32_e32 v15, v11 +; GFX940-NEXT: global_store_dwordx4 v16, 
v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v16 +; GFX900-NEXT: v_mov_b32_e32 v5, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v16 +; GFX90A-NEXT: v_mov_b32_e32 v5, v17 +; GFX90A-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v16 +; GFX940-NEXT: v_mov_b32_e32 v5, v17 +; GFX940-NEXT: global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v18 +; GFX900-NEXT: v_mov_b32_e32 v7, v19 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v18 +; GFX90A-NEXT: v_mov_b32_e32 v7, v19 +; GFX90A-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v18 +; GFX940-NEXT: v_mov_b32_e32 v7, v19 +; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: v_mov_b32_e32 v8, v20 +; GFX900-NEXT: v_mov_b32_e32 v9, v21 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v20 +; GFX90A-NEXT: v_mov_b32_e32 v9, v21 +; GFX90A-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v20 +; GFX940-NEXT: v_mov_b32_e32 v9, v21 +; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v22 +; 
GFX900-NEXT: v_mov_b32_e32 v11, v23 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v22 +; GFX90A-NEXT: v_mov_b32_e32 v11, v23 +; GFX90A-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v22 +; GFX940-NEXT: v_mov_b32_e32 v11, v23 +; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v24 +; GFX900-NEXT: v_mov_b32_e32 v13, v25 +; 
GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v24 +; GFX90A-NEXT: v_mov_b32_e32 v13, v25 +; GFX90A-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v24 +; GFX940-NEXT: v_mov_b32_e32 v13, v25 +; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v26 +; GFX900-NEXT: v_mov_b32_e32 v15, v27 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v26 +; GFX90A-NEXT: v_mov_b32_e32 v15, v27 +; GFX90A-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v26 +; GFX940-NEXT: v_mov_b32_e32 v15, v27 +; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v16 +; GFX900-NEXT: v_mov_b32_e32 v11, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, 
v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, v14 +; GFX90A-NEXT: v_mov_b32_e32 v27, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[26:29], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v26, v14 +; GFX940-NEXT: v_mov_b32_e32 v27, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[26:29], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2i64_v8i64__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v12 +; GFX900-NEXT: v_mov_b32_e32 v5, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v12 +; GFX90A-NEXT: v_mov_b32_e32 v5, v13 +; GFX90A-NEXT: 
global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v12 +; GFX940-NEXT: v_mov_b32_e32 v5, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v12 +; GFX940-NEXT: v_mov_b32_e32 v9, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> 
asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_14: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v12 +; GFX940-NEXT: v_mov_b32_e32 v15, v13 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__u_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__0_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v16 +; GFX900-NEXT: v_mov_b32_e32 v3, v17 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v16 +; GFX90A-NEXT: v_mov_b32_e32 v3, v17 +; GFX90A-NEXT: global_store_dwordx4 v18, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v18, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v16 +; 
GFX940-NEXT: v_mov_b32_e32 v3, v17 +; GFX940-NEXT: global_store_dwordx4 v18, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__1_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v18 +; GFX900-NEXT: v_mov_b32_e32 v5, v19 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v18 +; GFX90A-NEXT: v_mov_b32_e32 v5, v19 +; GFX90A-NEXT: global_store_dwordx4 v20, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v20, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v18 +; GFX940-NEXT: v_mov_b32_e32 v5, v19 +; GFX940-NEXT: 
global_store_dwordx4 v20, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__2_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v20 +; GFX900-NEXT: v_mov_b32_e32 v7, v21 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v20 +; GFX90A-NEXT: v_mov_b32_e32 v7, v21 +; GFX90A-NEXT: global_store_dwordx4 v22, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v22, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:21] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v20 +; GFX940-NEXT: v_mov_b32_e32 v7, v21 +; GFX940-NEXT: global_store_dwordx4 v22, v[4:7], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__3_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v22 +; GFX900-NEXT: v_mov_b32_e32 v9, v23 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v22 +; GFX90A-NEXT: v_mov_b32_e32 v9, v23 +; GFX90A-NEXT: global_store_dwordx4 v24, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v24, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v22 +; GFX940-NEXT: v_mov_b32_e32 v9, v23 +; GFX940-NEXT: global_store_dwordx4 v24, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__4_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[10:25] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v24 +; GFX900-NEXT: v_mov_b32_e32 v11, v25 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[10:25] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v24 +; GFX90A-NEXT: v_mov_b32_e32 v11, v25 +; GFX90A-NEXT: global_store_dwordx4 v26, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v26, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[10:25] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v24 +; GFX940-NEXT: v_mov_b32_e32 v11, v25 +; GFX940-NEXT: global_store_dwordx4 v26, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__5_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v26 +; GFX900-NEXT: v_mov_b32_e32 v13, v27 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v26 +; GFX90A-NEXT: v_mov_b32_e32 v13, v27 +; GFX90A-NEXT: global_store_dwordx4 v28, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v28, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v26 +; GFX940-NEXT: v_mov_b32_e32 v13, v27 +; GFX940-NEXT: global_store_dwordx4 v28, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = 
call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__6_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[14:29] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v28 +; GFX900-NEXT: v_mov_b32_e32 v15, v29 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[14:29] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, v28 +; GFX90A-NEXT: v_mov_b32_e32 v15, v29 +; GFX90A-NEXT: global_store_dwordx4 v30, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v30, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[14:29] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v28 +; GFX940-NEXT: v_mov_b32_e32 v15, v29 +; GFX940-NEXT: global_store_dwordx4 v30, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() 
+ %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__7_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v16 +; GFX900-NEXT: v_mov_b32_e32 v13, v17 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v32, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v28, v14 +; GFX90A-NEXT: v_mov_b32_e32 v29, v15 +; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v32, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v28, v14 +; GFX940-NEXT: v_mov_b32_e32 v29, v15 +; GFX940-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> 
%vec0, <8 x i64> %vec1, <2 x i32> <i32 7, i32 15> ; picks %vec0[7] and %vec1[7] + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__8_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 8, i32 15> ; picks %vec1[0] and %vec1[7] + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__9_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__9_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT:
;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__9_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__9_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__10_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__10_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v14 +; GFX900-NEXT: v_mov_b32_e32 v7, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2i64_v8i64__10_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v14 +; GFX90A-NEXT: v_mov_b32_e32 v7, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__10_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v14 +; GFX940-NEXT: v_mov_b32_e32 v7, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__11_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__11_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__11_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; 
GFX90A-NEXT: global_store_dwordx4 v16, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__11_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v14 +; GFX940-NEXT: v_mov_b32_e32 v9, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__12_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v14 +; GFX900-NEXT: v_mov_b32_e32 v11, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v14 +; GFX90A-NEXT: v_mov_b32_e32 v11, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v14 +; GFX940-NEXT: v_mov_b32_e32 v11, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__13_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__13_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__13_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__13_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def 
$0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 13, i32 15> ; picks %vec1[5] and %vec1[7] + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2i64_v8i64__14_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2i64_v8i64__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2i64_v8i64__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2i64_v8i64__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=v"() + %vec1 = call <8 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 14, i32 15> ; picks %vec1[6] and %vec1[7] + store <2 x i64> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_u: +; GFX90A:
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 poison> ; picks %vec0[0]; second lane is poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_u: +; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void 
+} + +define void @s_shuffle_v2i64_v8i64__4_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 4, i32 poison> ; picks %vec0[4]; second lane is poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT:
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 5, i32 poison> ; picks %vec0[5]; second lane is poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 6, i32 poison> ; picks %vec0[6]; second lane is poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void
@s_shuffle_v2i64_v8i64__7_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_u: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v2i64_v8i64__13_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_u() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s18 +; GFX940-NEXT: s_mov_b32 s5, s19 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s22 +; GFX900-NEXT: s_mov_b32 s5, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s22 +; GFX90A-NEXT: s_mov_b32 s5, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s18 +; GFX940-NEXT: s_mov_b32 s1, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + 
+define void @s_shuffle_v2i64_v8i64__15_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s26 +; GFX900-NEXT: s_mov_b32 s5, s27 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s26 +; GFX90A-NEXT: s_mov_b32 s5, s27 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s22 +; GFX940-NEXT: s_mov_b32 s1, s23 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_3() { +; GFX900-LABEL: 
s_shuffle_v2i64_v8i64__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s26 +; GFX900-NEXT: s_mov_b32 s9, s27 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s26 +; GFX90A-NEXT: s_mov_b32 s9, s27 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s22 +; GFX940-NEXT: s_mov_b32 s5, s23 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; 
GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s30 +; GFX900-NEXT: s_mov_b32 s5, s31 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s30 +; GFX90A-NEXT: s_mov_b32 s5, s31 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s26 +; GFX940-NEXT: s_mov_b32 s1, s27 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s30 +; GFX900-NEXT: s_mov_b32 s13, s31 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 
; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s30 +; GFX90A-NEXT: s_mov_b32 s13, s31 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s26 +; GFX940-NEXT: s_mov_b32 s9, s27 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, 
s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s50 +; GFX900-NEXT: s_mov_b32 s5, s51 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s50 +; GFX90A-NEXT: s_mov_b32 s5, s51 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s30 +; GFX940-NEXT: s_mov_b32 s1, s31 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 
+; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s50 +; GFX900-NEXT: s_mov_b32 s17, s51 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte 
Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s50 +; GFX90A-NEXT: s_mov_b32 s17, s51 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2i64_v8i64__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s30 +; GFX940-NEXT: s_mov_b32 s13, s31 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> 
%vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; 
GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__15_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__15_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__15_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__15_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} 
+ +define void @s_shuffle_v2i64_v8i64__u_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_0() { +; 
GFX900-LABEL: s_shuffle_v2i64_v8i64__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, 
<8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s4 +; 
GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_0() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s22, s4 +; GFX900-NEXT: s_mov_b32 s23, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[20:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s22, s4 +; GFX90A-NEXT: s_mov_b32 s23, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[20:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s18, s0 +; GFX940-NEXT: 
s_mov_b32 s19, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[16:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 
s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call 
void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i64_v8i64__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define 
void @s_shuffle_v2i64_v8i64__10_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_1() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s22, s6 +; GFX900-NEXT: s_mov_b32 s23, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[20:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s22, s6 +; GFX90A-NEXT: s_mov_b32 s23, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[20:23] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s18, s2 +; GFX940-NEXT: s_mov_b32 s19, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[16:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; 
GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; 
GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; 
GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v2i64_v8i64__9_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_2() { +; GFX900-LABEL: 
s_shuffle_v2i64_v8i64__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s22, s8 +; GFX900-NEXT: s_mov_b32 s23, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[20:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s22, s8 +; GFX90A-NEXT: s_mov_b32 s23, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[20:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s18, s4 +; GFX940-NEXT: s_mov_b32 s19, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[16:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s22 +; GFX900-NEXT: s_mov_b32 s5, s23 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s22 +; GFX90A-NEXT: s_mov_b32 s5, s23 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s18 +; GFX940-NEXT: s_mov_b32 s1, s19 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_2() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s26, s8 +; GFX900-NEXT: s_mov_b32 s27, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[24:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s26, s8 +; GFX90A-NEXT: s_mov_b32 s27, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[24:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s22, s4 +; GFX940-NEXT: s_mov_b32 s23, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[20:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_3() { +; GFX900-LABEL: 
s_shuffle_v2i64_v8i64__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s22, s10 +; GFX900-NEXT: s_mov_b32 s23, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[20:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s22, s10 +; GFX90A-NEXT: s_mov_b32 s23, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[20:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s18, s6 +; GFX940-NEXT: s_mov_b32 s19, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[16:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s18 +; GFX940-NEXT: s_mov_b32 s5, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_3() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s26, s10 +; GFX900-NEXT: s_mov_b32 s27, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[24:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s26, s10 +; GFX90A-NEXT: s_mov_b32 s27, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[24:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s22, s6 +; GFX940-NEXT: s_mov_b32 s23, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[20:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> 
+ call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s8 +; GFX940-NEXT: s_mov_b32 s11, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s12 +; GFX900-NEXT: s_mov_b32 s19, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s12 +; GFX90A-NEXT: s_mov_b32 s19, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s8 +; GFX940-NEXT: s_mov_b32 s15, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: 
s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; 
GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s22, s12 +; GFX900-NEXT: s_mov_b32 s23, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[20:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s22, s12 +; GFX90A-NEXT: s_mov_b32 s23, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[20:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s18, s8 +; GFX940-NEXT: s_mov_b32 s19, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[16:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s22 +; GFX900-NEXT: s_mov_b32 s5, s23 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded 
Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s22 +; GFX90A-NEXT: s_mov_b32 s5, s23 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s18 +; GFX940-NEXT: s_mov_b32 s1, s19 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 
v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s26, s12 +; GFX900-NEXT: s_mov_b32 s27, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[24:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s26, s12 +; GFX90A-NEXT: s_mov_b32 s27, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[24:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s22, s8 +; GFX940-NEXT: s_mov_b32 s23, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[20:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s26 +; GFX900-NEXT: s_mov_b32 s5, s27 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s26 +; GFX90A-NEXT: s_mov_b32 s5, s27 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s22 +; GFX940-NEXT: s_mov_b32 s1, s23 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_4() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s30, s12 +; GFX900-NEXT: s_mov_b32 s31, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[28:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s30, s12 +; GFX90A-NEXT: s_mov_b32 s31, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[28:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s26, s8 +; GFX940-NEXT: s_mov_b32 s27, s9 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[24:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_5: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect 
"; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s14 +; GFX900-NEXT: s_mov_b32 s19, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s14 +; GFX90A-NEXT: s_mov_b32 s19, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s10 +; GFX940-NEXT: s_mov_b32 s15, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_5: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 
s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s22, s14 +; GFX900-NEXT: s_mov_b32 s23, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[20:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; 
GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s22, s14 +; GFX90A-NEXT: s_mov_b32 s23, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[20:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s18, s10 +; GFX940-NEXT: s_mov_b32 s19, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[16:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s22 +; GFX900-NEXT: s_mov_b32 s13, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s22 +; GFX90A-NEXT: s_mov_b32 s13, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s18 +; GFX940-NEXT: s_mov_b32 s9, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 11, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s26, s14 +; GFX900-NEXT: s_mov_b32 s27, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[24:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT:
buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s26, s14 +; GFX90A-NEXT: s_mov_b32 s27, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[24:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s22, s10 +; GFX940-NEXT: s_mov_b32 s23, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[20:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 12, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +;
GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s26 +; GFX900-NEXT: s_mov_b32 s13, s27 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s26 +; GFX90A-NEXT: s_mov_b32 s13, s27 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s8, s22 +; GFX940-NEXT: s_mov_b32 s9, s23 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 13, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_5() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s30, s14 +; GFX900-NEXT: s_mov_b32 s31, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[28:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;
def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s30, s14 +; GFX90A-NEXT: s_mov_b32 s31, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[28:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s26, s10 +; GFX940-NEXT: s_mov_b32 s27, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[24:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 14, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +;
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 poison, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() +
%shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 0, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 1, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def
s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 2, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5,
s11 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 3, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT:
;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 4, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 5, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_6() { +;
GFX900-LABEL: s_shuffle_v2i64_v8i64__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s16 +; GFX900-NEXT: s_mov_b32 s19, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s16 +; GFX90A-NEXT: s_mov_b32 s19, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s12 +; GFX940-NEXT: s_mov_b32 s15, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 7, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +;
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> <i32 8, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s38 +; GFX900-NEXT: s_mov_b32 s5, s39 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32
s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s38 +; GFX90A-NEXT: 
s_mov_b32 s5, s39 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s18 +; GFX940-NEXT: s_mov_b32 s1, s19 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: 
scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> <i32 9, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s42, s16 +; GFX900-NEXT: s_mov_b32 s43, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[40:43] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT:
v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s42, s16 +; GFX90A-NEXT: s_mov_b32 s43, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[40:43] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s22, s12 +; GFX940-NEXT: s_mov_b32 s23, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[20:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm 
"; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s42 +; GFX900-NEXT: s_mov_b32 s5, s43 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 
s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s42 +; GFX90A-NEXT: s_mov_b32 s5, s43 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; 
GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s22 +; GFX940-NEXT: s_mov_b32 s1, s23 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x 
i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s46, s16 +; GFX900-NEXT: s_mov_b32 s47, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[44:47] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 
s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s46, s16 +; GFX90A-NEXT: s_mov_b32 s47, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[44:47] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: 
v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s26, s12 +; GFX940-NEXT: s_mov_b32 s27, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[24:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s46 +; GFX900-NEXT: s_mov_b32 s5, s47 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s46 +; GFX90A-NEXT: s_mov_b32 s5, s47 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; 
GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s26 +; GFX940-NEXT: s_mov_b32 s1, s27 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_6() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s50, s16 +; GFX900-NEXT: s_mov_b32 s51, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[48:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, 
s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s50, s16 +; GFX90A-NEXT: s_mov_b32 s51, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[48:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: 
v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s30, s12 +; GFX940-NEXT: s_mov_b32 s31, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[28:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_7: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s18 +; 
GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s14 +; GFX940-NEXT: s_mov_b32 s7, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2i64_v8i64__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: 
v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s38 +; GFX900-NEXT: s_mov_b32 s17, s39 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; 
GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s38 +; GFX90A-NEXT: s_mov_b32 s17, s39 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s18 +; GFX940-NEXT: s_mov_b32 s13, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 
+; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s42, s18 +; GFX900-NEXT: s_mov_b32 s43, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[40:43] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, 
s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s42, s18 +; GFX90A-NEXT: s_mov_b32 s43, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[40:43] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 
exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s22, s14 +; GFX940-NEXT: s_mov_b32 s23, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[20:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 
v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s42 +; GFX900-NEXT: s_mov_b32 s17, s43 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 
v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s42 +; GFX90A-NEXT: s_mov_b32 s17, s43 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s22 +; GFX940-NEXT: s_mov_b32 s13, s23 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s46, s18 +; GFX900-NEXT: s_mov_b32 s47, 
s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[44:47] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s46, s18 +; GFX90A-NEXT: s_mov_b32 s47, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[44:47] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s26, s14 +; GFX940-NEXT: s_mov_b32 s27, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[24:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, 
v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s46 +; GFX900-NEXT: s_mov_b32 s17, s47 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, 
v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s46 +; GFX90A-NEXT: s_mov_b32 s17, s47 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s26 +; GFX940-NEXT: s_mov_b32 s13, s27 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_7() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s50, s18 +; GFX900-NEXT: s_mov_b32 s51, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[48:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, 
v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s50, s18 +; GFX90A-NEXT: s_mov_b32 s51, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[48:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: 
v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s30, s14 +; GFX940-NEXT: s_mov_b32 s31, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[28:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v2i64_v8i64__u_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: 
s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; 
GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_8: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_8() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: 
v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s38 +; GFX900-NEXT: s_mov_b32 s5, s39 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: 
v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s38 +; GFX90A-NEXT: s_mov_b32 s5, s39 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, 
s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s18 +; GFX940-NEXT: s_mov_b32 s1, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s22 +; GFX900-NEXT: s_mov_b32 s5, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s22 +; GFX90A-NEXT: s_mov_b32 s5, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s18 +; GFX940-NEXT: s_mov_b32 s1, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 
v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s22 +; GFX900-NEXT: s_mov_b32 s5, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s22 +; GFX90A-NEXT: s_mov_b32 s5, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s18 +; GFX940-NEXT: s_mov_b32 s1, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s38 +; GFX900-NEXT: s_mov_b32 s19, s39 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 
s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s38 +; GFX90A-NEXT: s_mov_b32 s19, s39 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: 
v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s18 +; GFX940-NEXT: s_mov_b32 s15, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v2i64_v8i64__7_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s22 +; GFX900-NEXT: s_mov_b32 s5, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s22 +; GFX90A-NEXT: s_mov_b32 s5, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s18 +; GFX940-NEXT: s_mov_b32 s1, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} 
+ +define void @s_shuffle_v2i64_v8i64__11_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_9: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_9() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_10() { +; 
GFX900-LABEL: s_shuffle_v2i64_v8i64__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i64_v8i64__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s20 +; GFX900-NEXT: s_mov_b32 s15, s21 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_10: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s20 +; GFX90A-NEXT: s_mov_b32 s15, s21 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s16 +; GFX940-NEXT: s_mov_b32 s11, s17 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill 
+; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s40 +; GFX900-NEXT: s_mov_b32 s19, s41 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s40 +; GFX90A-NEXT: s_mov_b32 s19, s41 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s20 +; GFX940-NEXT: s_mov_b32 s15, s21 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; 
GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s40 +; GFX900-NEXT: s_mov_b32 s7, s41 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: 
v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; 
GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s40 +; GFX90A-NEXT: s_mov_b32 s7, s41 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s20 +; GFX940-NEXT: s_mov_b32 s3, s21 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; 
GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_10: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_10() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_10() 
{ +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s38 +; GFX900-NEXT: s_mov_b32 s9, s39 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; 
GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s38 +; GFX90A-NEXT: s_mov_b32 s9, s39 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s18 +; GFX940-NEXT: s_mov_b32 s5, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s14 +; GFX940-NEXT: s_mov_b32 s7, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x 
i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; 
GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s18 +; GFX940-NEXT: s_mov_b32 s5, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s22 +; GFX900-NEXT: s_mov_b32 s15, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_11: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s22 +; GFX90A-NEXT: s_mov_b32 s15, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s18 +; GFX940-NEXT: s_mov_b32 s11, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s18 +; GFX940-NEXT: s_mov_b32 s5, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: 
v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s42 +; GFX900-NEXT: s_mov_b32 s19, s43 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; 
GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s42 +; GFX90A-NEXT: s_mov_b32 s19, s43 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s22 +; GFX940-NEXT: s_mov_b32 s15, s23 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s22 +; GFX900-NEXT: s_mov_b32 s9, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s22 +; GFX90A-NEXT: s_mov_b32 s9, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s18 +; GFX940-NEXT: s_mov_b32 s5, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v2i64_v8i64__10_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: 
s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_11() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s20 +; GFX900-NEXT: s_mov_b32 s11, s21 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s20 +; GFX90A-NEXT: s_mov_b32 s11, s21 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s16 +; GFX940-NEXT: s_mov_b32 s7, s17 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> 
asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def 
$0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s24 +; GFX900-NEXT: s_mov_b32 s15, s25 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s24 +; GFX90A-NEXT: s_mov_b32 s15, s25 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 
s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s20 +; GFX940-NEXT: s_mov_b32 s11, s21 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s24 +; GFX900-NEXT: s_mov_b32 s7, s25 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 
4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s24 +; GFX90A-NEXT: s_mov_b32 s7, s25 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s20 +; GFX940-NEXT: s_mov_b32 s3, s21 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + 
ret void +} + +define void @s_shuffle_v2i64_v8i64__6_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s44 +; GFX900-NEXT: s_mov_b32 s19, s45 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 
+; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s44 +; GFX90A-NEXT: s_mov_b32 s19, s45 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 
s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s24 +; GFX940-NEXT: s_mov_b32 s15, s25 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s44 +; GFX900-NEXT: s_mov_b32 s7, s45 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; 
GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s44 +; GFX90A-NEXT: s_mov_b32 s7, s45 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; 
GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s24 +; GFX940-NEXT: s_mov_b32 s3, s25 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; 
GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: 
s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void 
asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s8 +; GFX940-NEXT: s_mov_b32 s11, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_12() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s12 +; GFX900-NEXT: s_mov_b32 s19, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s12 +; GFX90A-NEXT: s_mov_b32 s19, s13 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s8 +; GFX940-NEXT: s_mov_b32 s15, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call 
void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s38 +; GFX900-NEXT: s_mov_b32 s13, s39 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; 
GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s38 +; GFX90A-NEXT: s_mov_b32 s13, s39 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 
s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s18 +; GFX940-NEXT: s_mov_b32 s9, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s22 +; GFX900-NEXT: s_mov_b32 s11, s23 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s22 +; GFX90A-NEXT: s_mov_b32 s11, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s18 +; GFX940-NEXT: s_mov_b32 s7, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s22 +; GFX900-NEXT: 
s_mov_b32 s13, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s22 +; GFX90A-NEXT: s_mov_b32 s13, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s18 +; GFX940-NEXT: s_mov_b32 s9, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + 
%vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s26 +; GFX900-NEXT: s_mov_b32 s15, s27 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s26 +; GFX90A-NEXT: s_mov_b32 s15, s27 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 
+; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s22 +; GFX940-NEXT: s_mov_b32 s11, s23 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s22 +; GFX900-NEXT: s_mov_b32 s13, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s22 +; GFX90A-NEXT: s_mov_b32 s13, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s18 +; GFX940-NEXT: s_mov_b32 s9, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s46 +; GFX900-NEXT: s_mov_b32 s19, s47 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: 
v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s46 +; GFX90A-NEXT: s_mov_b32 s19, s47 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s26 +; GFX940-NEXT: s_mov_b32 s15, s27 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s22 +; GFX900-NEXT: s_mov_b32 s13, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s22 +; GFX90A-NEXT: s_mov_b32 s13, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s18 +; GFX940-NEXT: s_mov_b32 s9, s19 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_13() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s14 +; GFX900-NEXT: s_mov_b32 s19, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s14 +; GFX90A-NEXT: s_mov_b32 s19, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2i64_v8i64__14_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s10 +; GFX940-NEXT: s_mov_b32 s15, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> 
%vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_14: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s24 +; GFX900-NEXT: s_mov_b32 s11, s25 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s24 +; GFX90A-NEXT: s_mov_b32 s11, s25 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s20 +; GFX940-NEXT: s_mov_b32 s7, s21 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, 
s11 +; GFX900-NEXT: s_mov_b32 s6, s24 +; GFX900-NEXT: s_mov_b32 s7, s25 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s24 +; GFX90A-NEXT: s_mov_b32 s7, s25 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s20 +; GFX940-NEXT: s_mov_b32 s3, s21 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s28 +; GFX900-NEXT: s_mov_b32 s15, s29 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s28 +; GFX90A-NEXT: s_mov_b32 s15, s29 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s24 +; GFX940-NEXT: s_mov_b32 s11, 
s25 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s28 +; GFX900-NEXT: s_mov_b32 s7, s29 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s28 +; GFX90A-NEXT: s_mov_b32 s7, s29 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s24 +; GFX940-NEXT: s_mov_b32 s3, s25 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 
v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s48 +; GFX900-NEXT: s_mov_b32 s19, s49 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: 
v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s48 +; GFX90A-NEXT: s_mov_b32 s19, s49 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s28 +; GFX940-NEXT: s_mov_b32 s15, s29 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__7_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: 
v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s48 +; GFX900-NEXT: s_mov_b32 s7, s49 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, 
s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s48 +; GFX90A-NEXT: s_mov_b32 s7, s49 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; 
GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s28 +; GFX940-NEXT: s_mov_b32 s3, s29 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_14() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s16 +; GFX900-NEXT: s_mov_b32 s19, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s16 +; GFX90A-NEXT: s_mov_b32 s19, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s12 +; GFX940-NEXT: s_mov_b32 s15, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector 
<8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__u_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__0_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s22 +; GFX900-NEXT: s_mov_b32 s7, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2i64_v8i64__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s22 +; GFX90A-NEXT: s_mov_b32 s7, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s18 +; GFX940-NEXT: s_mov_b32 s3, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__1_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: 
v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s38 +; GFX900-NEXT: s_mov_b32 s17, s39 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; 
GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s38 +; GFX90A-NEXT: s_mov_b32 s17, s39 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, 
s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s18 +; GFX940-NEXT: s_mov_b32 s13, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__2_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s26 +; GFX900-NEXT: s_mov_b32 s11, s27 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s26 +; GFX90A-NEXT: s_mov_b32 s11, s27 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2i64_v8i64__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s22 +; GFX940-NEXT: s_mov_b32 s7, s23 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__3_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s22 +; GFX900-NEXT: s_mov_b32 s17, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s22 +; GFX90A-NEXT: s_mov_b32 s17, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s18 +; GFX940-NEXT: s_mov_b32 s13, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__4_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s30, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_writelane_b32 v0, s31, 1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:31] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s30 +; GFX900-NEXT: s_mov_b32 s15, s31 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s31, v0, 1 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s30 +; GFX90A-NEXT: s_mov_b32 s15, s31 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:27] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s10, s26 +; GFX940-NEXT: s_mov_b32 s11, s27 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__5_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:27] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s22 +; GFX900-NEXT: s_mov_b32 s17, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:27] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s22 +; GFX90A-NEXT: s_mov_b32 s17, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:23] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s18 +; GFX940-NEXT: s_mov_b32 s13, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def 
$0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__6_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_writelane_b32 v0, s36, 0 +; GFX900-NEXT: v_writelane_b32 v0, s37, 1 +; GFX900-NEXT: v_writelane_b32 v0, s38, 2 +; GFX900-NEXT: v_writelane_b32 v0, s39, 3 +; GFX900-NEXT: v_writelane_b32 v0, s40, 4 +; GFX900-NEXT: v_writelane_b32 v0, s41, 5 +; GFX900-NEXT: v_writelane_b32 v0, s42, 6 +; GFX900-NEXT: v_writelane_b32 v0, s43, 7 +; GFX900-NEXT: v_writelane_b32 v0, s44, 8 +; GFX900-NEXT: v_writelane_b32 v0, s45, 9 +; GFX900-NEXT: v_writelane_b32 v0, s46, 10 +; GFX900-NEXT: v_writelane_b32 v0, s47, 11 +; GFX900-NEXT: v_writelane_b32 v0, s48, 12 +; GFX900-NEXT: v_writelane_b32 v0, s49, 13 +; GFX900-NEXT: v_writelane_b32 v0, s50, 14 +; GFX900-NEXT: v_writelane_b32 v0, s51, 15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[36:51] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s50 +; GFX900-NEXT: s_mov_b32 s19, s51 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s51, v0, 15 +; GFX900-NEXT: v_readlane_b32 s50, v0, 14 +; GFX900-NEXT: v_readlane_b32 s49, v0, 13 +; GFX900-NEXT: v_readlane_b32 s48, v0, 12 +; GFX900-NEXT: v_readlane_b32 s47, v0, 11 +; GFX900-NEXT: v_readlane_b32 s46, v0, 10 +; GFX900-NEXT: v_readlane_b32 s45, v0, 9 +; GFX900-NEXT: v_readlane_b32 s44, v0, 8 +; GFX900-NEXT: v_readlane_b32 s43, v0, 7 +; GFX900-NEXT: v_readlane_b32 s42, v0, 6 +; 
GFX900-NEXT: v_readlane_b32 s41, v0, 5 +; GFX900-NEXT: v_readlane_b32 s40, v0, 4 +; GFX900-NEXT: v_readlane_b32 s39, v0, 3 +; GFX900-NEXT: v_readlane_b32 s38, v0, 2 +; GFX900-NEXT: v_readlane_b32 s37, v0, 1 +; GFX900-NEXT: v_readlane_b32 s36, v0, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: v_writelane_b32 v0, s36, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s37, 1 +; GFX90A-NEXT: v_writelane_b32 v0, s38, 2 +; GFX90A-NEXT: v_writelane_b32 v0, s39, 3 +; GFX90A-NEXT: v_writelane_b32 v0, s40, 4 +; GFX90A-NEXT: v_writelane_b32 v0, s41, 5 +; GFX90A-NEXT: v_writelane_b32 v0, s42, 6 +; GFX90A-NEXT: v_writelane_b32 v0, s43, 7 +; GFX90A-NEXT: v_writelane_b32 v0, s44, 8 +; GFX90A-NEXT: v_writelane_b32 v0, s45, 9 +; GFX90A-NEXT: v_writelane_b32 v0, s46, 10 +; GFX90A-NEXT: v_writelane_b32 v0, s47, 11 +; GFX90A-NEXT: v_writelane_b32 v0, s48, 12 +; GFX90A-NEXT: v_writelane_b32 v0, s49, 13 +; GFX90A-NEXT: v_writelane_b32 v0, s50, 14 +; GFX90A-NEXT: v_writelane_b32 v0, s51, 15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[36:51] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s50 +; GFX90A-NEXT: s_mov_b32 s19, s51 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s51, v0, 15 +; GFX90A-NEXT: v_readlane_b32 s50, v0, 14 +; GFX90A-NEXT: v_readlane_b32 s49, v0, 13 +; GFX90A-NEXT: v_readlane_b32 s48, v0, 12 +; GFX90A-NEXT: v_readlane_b32 s47, 
v0, 11 +; GFX90A-NEXT: v_readlane_b32 s46, v0, 10 +; GFX90A-NEXT: v_readlane_b32 s45, v0, 9 +; GFX90A-NEXT: v_readlane_b32 s44, v0, 8 +; GFX90A-NEXT: v_readlane_b32 s43, v0, 7 +; GFX90A-NEXT: v_readlane_b32 s42, v0, 6 +; GFX90A-NEXT: v_readlane_b32 s41, v0, 5 +; GFX90A-NEXT: v_readlane_b32 s40, v0, 4 +; GFX90A-NEXT: v_readlane_b32 s39, v0, 3 +; GFX90A-NEXT: v_readlane_b32 s38, v0, 2 +; GFX90A-NEXT: v_readlane_b32 s37, v0, 1 +; GFX90A-NEXT: v_readlane_b32 s36, v0, 0 +; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_store_dword off, v0, s32 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: v_writelane_b32 v0, s30, 0 +; GFX940-NEXT: v_writelane_b32 v0, s31, 1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[16:31] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s30 +; GFX940-NEXT: s_mov_b32 s15, s31 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_readlane_b32 s31, v0, 1 +; GFX940-NEXT: v_readlane_b32 s30, v0, 0 +; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX940-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; GFX940-NEXT: s_mov_b64 exec, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v2i64_v8i64__7_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s22 +; GFX900-NEXT: s_mov_b32 s17, s23 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s22 +; GFX90A-NEXT: s_mov_b32 s17, s23 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s18 +; GFX940-NEXT: s_mov_b32 s13, s19 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__8_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__9_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__9_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__9_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__9_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__10_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__10_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__10_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__10_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s14 +; GFX940-NEXT: s_mov_b32 s7, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__11_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__11_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__11_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__11_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__12_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__13_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__13_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__13_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__13_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} + +define void @s_shuffle_v2i64_v8i64__14_15() { +; GFX900-LABEL: s_shuffle_v2i64_v8i64__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2i64_v8i64__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2i64_v8i64__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x i64> asm "; def $0", "=s"() + %vec1 = call <8 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <8 x i64> %vec0, <8 x i64> %vec1, <2 x i32> + call void asm sideeffect "; 
use $0", "{s[10:13]}"(<2 x i64> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll new file mode 100644 index 0000000000000..11aff0e14c829 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll @@ -0,0 +1,2104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2p0_v2p0__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v2p0__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> poison + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 
v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__2_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v2p0__2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 
v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 
+; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> zeroinitializer + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + 
%shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__0_1: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__u_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v2p0__u_2: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v2p0__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v2p0__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v2p0__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v2p0__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v2p0__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v2p0_v2p0__u_u() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__0_u() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__1_u() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void 
+} + +define void @s_shuffle_v2p0_v2p0__2_u() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__3_u() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__3_0() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__3_1() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + 
call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__3_2() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__3_3() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__u_0() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__0_0() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> zeroinitializer + call void asm 
sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__1_0() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__2_0() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__u_1() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__0_1() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__1_1() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__2_1() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__u_2() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__0_2() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__1_2() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__2_2() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__u_3() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__0_3() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__1_3() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v2p0__2_3() { +; GFX900-LABEL: s_shuffle_v2p0_v2p0__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v2p0__2_3: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v2p0__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll new file mode 100644 index 0000000000000..a110ccffdf83b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll @@ -0,0 +1,4469 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2p0_v3p0__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v3p0__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x 
ptr> poison, <2 x i32> poison + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__3_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v3p0__3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__5_0(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x 
i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} 
+ +define void @v_shuffle_v2p0_v3p0__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__5_3(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2p0_v3p0__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> zeroinitializer + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__2_0(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__4_0: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, 
v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 
v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__0_2(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2p0_v3p0__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2p0_v3p0__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__u_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v3p0__u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2p0_v3p0__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v3p0__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2p0_v3p0__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> 
%vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, 
align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__2_4(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__3_4: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x 
ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v3p0__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v3p0__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v3p0__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v3p0__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v2p0_v3p0__u_u() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__0_u() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__1_u() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__2_u() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__3_u() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__4_u() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__5_u() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm 
"; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__5_0() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def 
$0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__5_1() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__5_2() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__5_2: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__5_3() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__5_4() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 
s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__5_5() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; 
GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__u_0() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__0_0() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__1_0() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__2_0() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: 
s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__3_0() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__4_0() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__u_1() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__0_1() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__1_1() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__2_1() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__3_1() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__4_1() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void 
+} + +define void @s_shuffle_v2p0_v3p0__u_2() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__0_2() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__1_2() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__2_2() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call 
void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__3_2() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__4_2() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__u_3() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__u_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__0_3() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__1_3() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__2_3() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__3_3() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__4_3() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__4_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__u_4() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__0_4() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf 
= shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__1_4() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__2_4() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__2_4: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__3_4() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2p0_v3p0__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__4_4() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: 
s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__u_5() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__0_5() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__1_5() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; 
GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__2_5() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__3_5() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v3p0__4_5() { +; GFX900-LABEL: s_shuffle_v2p0_v3p0__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v3p0__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v3p0__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll new file mode 100644 index 0000000000000..3b6379d59cae4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll @@ -0,0 +1,7547 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2p0_v4p0__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v4p0__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> poison + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2p0_v4p0__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__3_u(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__4_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v4p0__4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__5_u(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 
v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__7_6(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> zeroinitializer + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x 
ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 
v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2p0_v4p0__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 
0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__5_1: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v2 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> 
poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v4 +; GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__0_3(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__2_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector 
<4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 
v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, v6 +; GFX90A-NEXT: v_mov_b32_e32 v15, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v6 +; GFX940-NEXT: v_mov_b32_e32 v15, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__u_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v4p0__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2p0_v4p0__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__3_4(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2p0_v4p0__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p0_v4p0__4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__5_4(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; 
GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2p0_v4p0__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__u_6: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 
+; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: 
global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__5_6(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 
v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def 
$0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v2p0_v4p0__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p0_v4p0__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p0_v4p0__6_7: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p0_v4p0__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + store <2 x ptr> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v2p0_v4p0__u_u() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__0_u() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_u: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__1_u() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2p0_v4p0__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__2_u() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__3_u() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__4_u() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__5_u() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__6_u() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__7_u() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__7_0() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; 
GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__7_1() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf 
= shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__7_2() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm 
sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__7_3() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__7_4() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__7_5() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__7_6() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__7_7() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, 
<2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__u_0() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__0_0() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_0: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__1_0() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__2_0() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + 
+define void @s_shuffle_v2p0_v4p0__3_0() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__4_0() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__5_0() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; 
GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__6_0() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__u_1() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + 
+define void @s_shuffle_v2p0_v4p0__0_1() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__1_1() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__2_1() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x 
i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__3_1() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__4_1() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__5_1() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__6_1() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__u_2() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__0_2() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__1_2() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__2_2() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__3_2() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__4_2() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__5_2() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__6_2() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__u_3() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + 
call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__0_3() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__1_3() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__2_3() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm 
"; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__3_3() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__4_3() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2p0_v4p0__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__5_3() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__6_3() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__u_4() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__0_4() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2p0_v4p0__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__1_4() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void 
@s_shuffle_v2p0_v4p0__2_4() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__3_4() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__4_4() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__5_4() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, 
s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__6_4() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__u_5() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use 
$0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__0_5() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__1_5() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__2_5() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__3_5() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__4_5() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__5_5() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__6_5() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__u_6() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__0_6() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__1_6() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 
s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__2_6() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x 
ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__3_6() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__4_6() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__5_6() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__6_6() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__u_7() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__0_7() { +; 
GFX900-LABEL: s_shuffle_v2p0_v4p0__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__1_7() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__2_7() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2p0_v4p0__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s14 +; GFX940-NEXT: s_mov_b32 s7, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__3_7() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__4_7() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; 
GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__5_7() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v2p0_v4p0__6_7() { +; GFX900-LABEL: s_shuffle_v2p0_v4p0__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p0_v4p0__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p0_v4p0__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<2 x ptr> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll new file mode 100644 index 0000000000000..475c7324fb77c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll @@ -0,0 +1,1875 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2p3_v2p3__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v2p3__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> poison + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def 
$0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__2_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v2p3__2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def 
$0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x 
i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> zeroinitializer + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)>
%vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 1, i32 0> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 0> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT:
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 poison, i32 1> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 0, i32 1> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 1, i32 1> + store <2
x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 1> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__u_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v2p3__u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 poison, i32 2> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void
@v_shuffle_v2p3_v2p3__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 0, i32 2> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0)
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 1, i32 2> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v2p3__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 2> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT:
v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> <i32 poison, i32 3> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL:
v_shuffle_v2p3_v2p3__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> <i32 0, i32 3> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT:
;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> <i32 1, i32 3> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v2p3__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v2p3__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v2p3__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v2p3__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf
= shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> <i32 2, i32 3> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v2p3_v2p3__u_u() { +; GFX9-LABEL: s_shuffle_v2p3_v2p3__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__0_u() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 0, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret
void +} + +define void @s_shuffle_v2p3_v2p3__1_u() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 1, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__2_u() { +; GFX9-LABEL: s_shuffle_v2p3_v2p3__2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__3_u() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__3_u: +;
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> <i32 3, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__3_0() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +;
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> <i32 3, i32 0> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__3_1() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT:
; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> <i32 3, i32 1> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__3_2() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11]
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> <i32 3, i32 2> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__3_3() { +; GFX9-LABEL: s_shuffle_v2p3_v2p3__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> <i32 3, i32 3> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__u_0() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +;
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 poison, i32 0> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__0_0() { +; GFX9-LABEL: s_shuffle_v2p3_v2p3__0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__1_0() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__1_0: +;
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 1, i32 0> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__2_0() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 0> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void
@s_shuffle_v2p3_v2p3__u_1() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__0_1() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2p3_v2p3__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__1_1() { +; GFX9-LABEL: s_shuffle_v2p3_v2p3__1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__2_1() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__2_1: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__u_2() { +; GFX9-LABEL: s_shuffle_v2p3_v2p3__u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__0_2() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 
0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__1_2() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__2_2() { +; GFX9-LABEL: s_shuffle_v2p3_v2p3__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; 
GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__u_3() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__0_3() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__1_3() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v2p3__2_3() { +; GFX900-LABEL: s_shuffle_v2p3_v2p3__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v2p3__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v2p3__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll new file mode 100644 index 0000000000000..821b51451bd17 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll @@ -0,0 +1,4236 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2p3_v3p3__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v3p3__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> poison + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2p3_v3p3__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + 
%shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__3_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v3p3__3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm 
"; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: 
global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> <i32 5, i32 5> ; lanes: vec1[2], vec1[2] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 poison, i32 0> ; lanes: poison, vec0[0] +
store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> zeroinitializer ; mask <0, 0>; lanes: vec0[0], vec0[0] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT:
global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 1, i32 0> ; lanes: vec0[1], vec0[0] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT:
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 0> ; lanes: vec0[2], vec0[0] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT:
v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 3, i32 0> ; lanes: poison, vec0[0] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> <i32 4, i32 0> ; lanes: vec1[1], vec0[0] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 poison, i32 1> ; lanes: poison, vec0[1] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 0, i32 1> ; lanes: vec0[0], vec0[1] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3,
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 1, i32 1> ; lanes: vec0[1], vec0[1] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 1> ; lanes: vec0[2], vec0[1] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 3, i32 1> ; lanes: poison, vec0[1] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +;
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> <i32 4, i32 1> ; lanes: vec1[1], vec0[1] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +;
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 poison, i32 2> ; lanes: poison, vec0[2] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL:
v_shuffle_v2p3_v3p3__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 0, i32 2> ; lanes: vec0[0], vec0[2] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr
addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 1, i32 2> ; lanes: vec0[1], vec0[2] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 2> ; lanes: vec0[2], vec0[2] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 3, i32 2> ; lanes: poison, vec0[2] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> <i32 4, i32 2> ; lanes: vec1[1], vec0[2] + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__u_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v3p3__u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 poison, i32 3> ; lanes: poison, poison + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def
v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 0, i32 3> ; lanes: vec0[0], poison + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> <i32 1, i32 3> ; lanes: vec0[1], poison + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm ";
def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v3p3__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, 
<2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2p3_v3p3__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2p3_v3p3__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v3p3__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v3p3__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v3p3__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v3p3__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v2p3_v3p3__u_u() { +; GFX9-LABEL: s_shuffle_v2p3_v3p3__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__0_u() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__1_u() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__2_u() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__3_u() { +; GFX9-LABEL: s_shuffle_v2p3_v3p3__3_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: 
;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__4_u() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__5_u() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__5_0() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__5_1() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__5_2() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 
s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__5_3() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v2p3_v3p3__5_4() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__5_5() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2p3_v3p3__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__u_0() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__0_0() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + 
+define void @s_shuffle_v2p3_v3p3__1_0() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__2_0() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__3_0() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__4_0() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 
x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__u_1() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__0_1() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__0_1: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__1_1() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__2_1() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__3_1() { +; 
GFX900-LABEL: s_shuffle_v2p3_v3p3__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__4_1() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__u_2() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__u_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__0_2() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__1_2() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__2_2() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2p3_v3p3__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__3_2() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__4_2() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__u_3() { +; GFX9-LABEL: s_shuffle_v2p3_v3p3__u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__0_3() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr 
addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__1_3() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__2_3() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__3_3() { +; GFX9-LABEL: s_shuffle_v2p3_v3p3__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__4_3() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__u_4() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__0_4() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 
x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__1_4() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__2_4() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__3_4() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__4_4() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__u_5() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__0_5() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector 
<3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__1_5() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v2p3_v3p3__2_5() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__3_5() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v3p3__4_5() { +; GFX900-LABEL: s_shuffle_v2p3_v3p3__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v3p3__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v3p3__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll new file mode 100644 index 0000000000000..b6bfd0c98c086 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll @@ -0,0 +1,6929 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2p3_v4p3__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v4p3__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector 
<4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> poison + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr 
addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__4_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v4p3__4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x 
i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx2 v7, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector 
<4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> zeroinitializer + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr 
addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2p3_v4p3__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v5, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr 
addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> 
asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret 
void +} + +define void @v_shuffle_v2p3_v4p3__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v7, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 
v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v7, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2p3_v4p3__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__1_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2p3_v4p3__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2p3_v4p3__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__u_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v4p3__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define 
void @v_shuffle_v2p3_v4p3__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v4p3__4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define 
void @v_shuffle_v2p3_v4p3__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, 
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, 
v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: 
global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x 
ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr 
addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +; Mask <4, 6> indexes the concatenation of %vec0 and %vec1, so both result lanes come from %vec1 (its elements 0 and 2) and %vec0 is unused. +define void @v_shuffle_v2p3_v4p3__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> <i32 4, i32 6> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +;
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x 
ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v7, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 
x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +; Mask <3, 7> picks the last element of each input: lane 0 is %vec0[3], lane 1 is %vec1[3]. +define void @v_shuffle_v2p3_v4p3__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> <i32 3, i32 7> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void
@v_shuffle_v2p3_v4p3__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v4p3__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v4p3__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v4p3__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v4p3__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +; The mask is entirely poison, so the shuffle result does not depend on %vec0; the checks contain only the asm use of s[10:11]. +define void @s_shuffle_v2p3_v4p3__u_u() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__0_u() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +; Mask <1, poison>: lane 0 is %vec0[1]; lane 1 is left unspecified. +define void @s_shuffle_v2p3_v4p3__1_u() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> <i32 1, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +}
+ +; Mask <2, poison>: lane 0 is %vec0[2]; lane 1 is left unspecified. +define void @s_shuffle_v2p3_v4p3__2_u() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> <i32 2, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__3_u() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use
s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +; Mask <4, poison>: index 4 is element 0 of the second operand, which is poison here, so no lane reads %vec0. +define void @s_shuffle_v2p3_v4p3__4_u() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__4_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> <i32 4, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__5_u() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL:
s_shuffle_v2p3_v4p3__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__6_u() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> 
%shuf) + ret void +} + +; Mask <7, poison>: lane 0 is %vec1[3] (index 7 of the concatenated inputs); lane 1 is left unspecified and %vec0 is unused. +define void @s_shuffle_v2p3_v4p3__7_u() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> <i32 7, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__7_0() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL:
s_shuffle_v2p3_v4p3__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__7_1() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__7_2() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__7_3() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__7_4() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> 
%vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__7_5() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__7_6() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__7_7() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__u_0() { +; GFX900-LABEL: 
s_shuffle_v2p3_v4p3__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__0_0() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 
+; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__1_0() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__2_0() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__3_0() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__4_0() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__5_0() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__6_0() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__u_1() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__0_1() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr 
addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__1_1() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__2_1() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__3_1() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__4_1() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__5_1() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__6_1() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__6_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__u_2() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; 
use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__0_2() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__1_2() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__2_2() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__3_2() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, 
s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__4_2() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__4_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__5_2() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__6_2() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr 
addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__u_3() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__0_3() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__1_3() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__1_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__2_3() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__3_3() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__4_3() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__5_3() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__6_3() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr 
addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__u_4() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__u_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__0_4() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__1_4() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__2_4() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__3_4() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> 
poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__4_4() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__5_4() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__6_4() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__6_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__u_5() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__0_5() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__1_5() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v2p3_v4p3__2_5() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__3_5() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__4_5() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__5_5() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__6_5() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__6_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__u_6() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2p3_v4p3__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__0_6() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__1_6() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__2_6() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v2p3_v4p3__3_6() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__4_6() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__5_6() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__6_6() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__u_7() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__u_7: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__0_7() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2p3_v4p3__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__1_7() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__2_7() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr 
addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__3_7() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__4_7() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__5_7() { +; GFX9-LABEL: s_shuffle_v2p3_v4p3__5_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, 
<2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v4p3__6_7() { +; GFX900-LABEL: s_shuffle_v2p3_v4p3__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v4p3__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v4p3__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll new file mode 100644 index 0000000000000..c3d04867d3440 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll @@ -0,0 +1,25924 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v2p3_v8p3__u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v8p3__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> poison + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 
x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 
8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v8p3__8_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + 
store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2p3_v8p3__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> 
asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v8 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v9 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v9 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; 
def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr 
addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v10 +; GFX900-NEXT: global_store_dwordx2 v11, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v11 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + 
store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v11 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2p3_v8p3__15_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v12 +; GFX900-NEXT: global_store_dwordx2 v13, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v13 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_5(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v2p3_v8p3__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v13 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v14 +; GFX900-NEXT: global_store_dwordx2 v15, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v15 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v15 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v15 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm 
"; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_13(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__15_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__15_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__15_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; 
GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__15_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 
v1, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> zeroinitializer + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_0(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_0: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 
+; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, 
v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = 
call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: global_store_dwordx2 v9, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v0 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v0 +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr 
addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr 
addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 
0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 
v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v2p3_v8p3__0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2p3_v8p3__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> 
poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_2(ptr addrspace(1) inreg %ptr) { 
+; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_2: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: global_store_dwordx2 v11, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v2 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v2 +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> 
poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2p3_v8p3__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + 
%shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", 
"=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v9 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v9 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr 
addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr 
addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v8, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2p3_v8p3__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v4 +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v11 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: global_store_dwordx2 v13, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v4 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v4 +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_5: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr 
addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 
+; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = 
shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 
v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[7:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> 
%vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2p3_v8p3__11_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v9 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v9 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_5: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v11 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v11 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_6: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 
v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v9, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v11, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v6 +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v11, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v11 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: 
global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v13, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v6 +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v13, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[12:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v13 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_6: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: global_store_dwordx2 v15, v[13:14], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v15, v6 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v15, v6 +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr 
addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 
+; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v9 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v9 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v7 +; GFX900-NEXT: 
global_store_dwordx2 v16, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v11 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v11 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[13:14], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v13 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v13 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v15, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v15, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v8p3__u_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_8(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_8(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v2p3_v8p3__8_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: 
global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> 
%vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_8(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 
0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 
+; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v8 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> 
asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = 
shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_9(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_10: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v8 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 
0 +; GFX940-NEXT: v_mov_b32_e32 v5, v8 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: 
global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v10 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v10 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[9:10], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", 
"=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2p3_v8p3__10_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx2 v8, 
v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store 
<2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_10(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; 
GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v8 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v9 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v10 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx2 v16, 
v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2p3_v8p3__9_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2p3_v8p3__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_11: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def 
$0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_11(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_11: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v10, 
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v8 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v8 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v10 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v10 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v12 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v12 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[11:12], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_12(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx2 
v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 
+; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector 
<8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_12(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v4 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v8 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx2 v12, 
v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v9 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v2p3_v8p3__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v10 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v11 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 
v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v12 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v2p3_v8p3__9_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, 
v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_13(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v2p3_v8p3__14_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v2p3_v8p3__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: global_store_dwordx2 v10, 
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v10 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2p3_v8p3__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v10 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_14: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v12 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v12 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v14 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v14 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[13:14], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v14 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v14 +; 
GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_14(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__u_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__u_15: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__0_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:8] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v8 +; GFX900-NEXT: global_store_dwordx2 v9, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__0_15: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__1_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v1 +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__2_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v11, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v10 +; GFX900-NEXT: global_store_dwordx2 v11, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__3_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v3 +; GFX90A-NEXT: global_store_dwordx2 
v12, v[10:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v3 +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__4_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:12] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v13, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v12 +; GFX900-NEXT: global_store_dwordx2 v13, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v2p3_v8p3__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v13 +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__5_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v5 +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__6_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[7:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v15, 0 +; GFX900-NEXT: v_mov_b32_e32 v7, v14 +; GFX900-NEXT: global_store_dwordx2 v15, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v7, v15 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__7_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v14, v7 +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__8_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__8_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> 
%vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__9_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__9_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__9_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__9_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__10_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__10_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__10_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__10_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__11_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__11_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[3:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__11_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__11_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__12_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__13_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__13_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[5:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__13_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__13_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr 
addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v2p3_v8p3__14_15(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v2p3_v8p3__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v2p3_v8p3__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v2p3_v8p3__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + store <2 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_u() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 
x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_u() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__8_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__10_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__10_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__10_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_u() { +; GFX900-LABEL: 
s_shuffle_v2p3_v8p3__12_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__14_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__14_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__14_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_u() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s15 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s19 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s19 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s4, s15 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s15 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr 
addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_11() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__15_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__15_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__15_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__15_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> 
+ call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__15_15() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__15_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: 
s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_0() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__2_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s4 +; 
GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_0() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__6_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__10_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__10_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__10_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s13, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_0() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__14_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__14_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__14_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_1() { 
+; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_1() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__2_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2p3_v8p3__3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_1() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__6_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 
s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__10_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__10_1: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__10_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 
s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_1() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__14_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__14_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__14_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, 
s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_2() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_2() { +; GFX9-LABEL: 
s_shuffle_v2p3_v8p3__6_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s6 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2p3_v8p3__9_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__10_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__10_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__10_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2p3_v8p3__11_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s13, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_2() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__14_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__14_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__14_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def 
$0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_3: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_3() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__1_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_3() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_3() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__5_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_3() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__6_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s7 +; 
GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_3() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__7_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret 
void +} + +define void @s_shuffle_v2p3_v8p3__10_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__10_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__10_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__10_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_3() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__14_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__14_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__14_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_4() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__2_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s12 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, 
s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr 
addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_4() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__6_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__10_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2p3_v8p3__10_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__10_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s17, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s13 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_4() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__14_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__14_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__14_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s12 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 
s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_5() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__2_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_5() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__6_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + 
+define void @s_shuffle_v2p3_v8p3__8_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_5: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__10_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__10_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__10_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2p3_v8p3__11_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s17 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s17 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s13 +; GFX940-NEXT: s_mov_b64 
s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_5() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__14_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__14_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__14_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = 
shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_6() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__2_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s14 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + 
%shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_6() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] 
+; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s9 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__10_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__10_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__10_6: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s10, s11 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s17, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s13 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr 
addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_6() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__14_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__14_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__14_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_7: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_7() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__1_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_7() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__2_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v2p3_v8p3__3_7() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__3_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr 
addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_7() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__5_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; 
use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_7() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__7_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void 
+} + +define void @s_shuffle_v2p3_v8p3__9_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__10_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__10_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__10_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_7: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_7: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_7() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__14_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__14_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__14_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_8() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__u_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__2_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__2_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__2_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_8: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__6_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__6_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2p3_v8p3__6_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v2p3_v8p3__8_8() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__8_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> poison, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_8() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__10_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_8() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_8: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_8: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_8: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_8() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__14_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s4 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr 
addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], 
s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2p3_v8p3__1_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__2_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__2_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__2_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s8, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__6_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__6_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__6_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2p3_v8p3__9_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_9() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__10_9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_9() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_9: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_9: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_9: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_9() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__14_9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v2p3_v8p3__u_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__2_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__2_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__2_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_10: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s14 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s14 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s10 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__6_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__6_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__6_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_10: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_10() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__10_10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: 
s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr 
addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_10() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_10: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_10: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_10: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_10() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__14_10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s6 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_11() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_11() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_11() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_11() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__2_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__2_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__2_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_11() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_11() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_11() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 
x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_11() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__6_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__6_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__6_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_11() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_11() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_11() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__9_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_11() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__10_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__10_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__10_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_11() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__11_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_11() { +; GFX900-LABEL: 
s_shuffle_v2p3_v8p3__12_11: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_11: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_11: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_11() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__13_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm 
"; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_11() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__14_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s12 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s12 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s8 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; 
def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr 
addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__2_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__2_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__2_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s16 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s16 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s12 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s12 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__6_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__6_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__6_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_12: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s12 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_12() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__10_12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s12 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x 
i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s8 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s8 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s4 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_12() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_12: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s8 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_12: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_12: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_12() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__14_12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s8 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_13() { +; GFX900-LABEL: 
s_shuffle_v2p3_v8p3__u_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_13: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__2_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__2_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__2_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s5, s13 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__6_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__6_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__6_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[16:17] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[12:13] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_13: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_13() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__10_13: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s13 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_13() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_13: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_13: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_13: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_13() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__14_13: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__0_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + 
%shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__2_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__2_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__2_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_14: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s18 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s18 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s14 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__6_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__6_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__6_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s18 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s18 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s7 +; GFX940-NEXT: s_mov_b32 s11, s14 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__9_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__9_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__9_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v2p3_v8p3__10_14() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__10_14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s14 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__11_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__11_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__11_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = 
call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_14() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__13_14: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: s_mov_b32 s11, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__13_14: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: s_mov_b32 s11, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__13_14: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_14() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__14_14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect 
"; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__u_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__u_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__u_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__u_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__0_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__0_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v2p3_v8p3__0_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__0_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__1_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__1_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__1_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__1_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__2_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__2_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__2_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v2p3_v8p3__2_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__3_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__3_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__3_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__3_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__4_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__4_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__4_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__4_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x 
ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__5_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__5_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__5_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__5_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> 
%shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__6_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__6_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__6_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__6_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__7_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__7_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__7_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__7_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__8_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__8_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__8_15: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__8_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[0:1] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__9_15() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__9_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s5 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__10_15() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__10_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[8:15] +; GFX9-NEXT: ;;#ASMEND +; 
GFX9-NEXT: s_mov_b32 s11, s15 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__11_15() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__11_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s7 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__12_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__12_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__12_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__12_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__13_15() { +; GFX9-LABEL: s_shuffle_v2p3_v8p3__13_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[4:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s9 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v2p3_v8p3__14_15() { +; GFX900-LABEL: s_shuffle_v2p3_v8p3__14_15: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v2p3_v8p3__14_15: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v2p3_v8p3__14_15: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <8 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <8 x ptr addrspace(3)> %vec0, <8 x ptr addrspace(3)> %vec1, <2 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<2 x ptr addrspace(3)> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll new file mode 100644 index 0000000000000..248ebe3464417 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v2bf16.ll @@ -0,0 +1,4042 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3bf16_v2bf16__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v2bf16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> 
poison + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3bf16_v2bf16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v2bf16__2_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_u: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v0, v1, 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; 
GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x 
bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; 
def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; 
def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm 
"; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3bf16_v2bf16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> zeroinitializer + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_1_0: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v2, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3bf16_v2bf16__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: 
global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v2bf16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3bf16_v2bf16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v2bf16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; 
GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] 
offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; 
GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; 
GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3bf16_v2bf16__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v2bf16__3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v2bf16__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v2bf16__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v2bf16__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__u_u_u() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> poison + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__0_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__1_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x 
i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__2_u_u() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__2_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_0_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_1_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_2_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_3_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, 
s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_3_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_3_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_3_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__u_0_0() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__0_0_0() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> zeroinitializer + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__1_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__2_0_0() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x 
bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_u_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_u_0: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_1_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: 
s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_2_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, 
s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__u_1_1() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__u_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x 
bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__0_1_1() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__0_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__1_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__2_1_1() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__2_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, 
s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_u_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_0_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_2_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3bf16_v2bf16__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__u_2_2() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__0_2_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__1_2_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> 
%vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__2_2_2() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_2_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_u_2() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_0_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_1_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__u_3_3() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__0_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__1_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__2_3_3() { +; GFX9-LABEL: s_shuffle_v3bf16_v2bf16__2_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + 
%vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_u_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_0_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_0_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_1_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v2bf16__3_2_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v2bf16__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 
s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v2bf16__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v2bf16__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll new file mode 100644 index 0000000000000..4da6b981889c2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v3bf16.ll @@ -0,0 +1,9009 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3bf16_v3bf16__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v3bf16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> poison + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v3bf16_v3bf16__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v3bf16__3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x 
i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_1_u(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret 
void +} + +define void @v_shuffle_v3bf16_v3bf16__5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x 
bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_4_u(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3bf16_v3bf16__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x 
i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; 
GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] 
offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 5, i32 3> ; lanes: %2[2], %2[2], %2[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__5_5_4(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_5_4:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4
+; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_5_4:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4
+; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_5_4:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: s_mov_b32 
s2, 0x5040100
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2
+; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 5, i32 4> ; lanes: %2[2], %2[2], %2[1]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__5_5_5(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_5_5:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4
+; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_5_5:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4
+; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_5_5:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; 
def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s2, 0x5040100
+; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2
+; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 5, i32 5> ; lanes: %2[2], %2[2], %2[2]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__u_0_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__u_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__u_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__u_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: 
s_nop 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> <i32 poison, i32 0, i32 0> ; lanes: poison, %1[0], %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__0_0_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__0_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4
+; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__0_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4
+; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__0_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s2, 0x5040100
+; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2
+; GFX940-NEXT: global_store_short v2, v0, s[0:1] 
offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> zeroinitializer ; lanes: %1[0], %1[0], %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__1_0_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__1_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16
+; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__1_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16
+; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__1_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16
+; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x 
bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> <i32 1, i32 0, i32 0> ; lanes: %1[1], %1[0], %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__2_0_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__2_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4
+; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__2_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4
+; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__2_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s2, 0x5040100
+; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2
+; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> <i32 2, i32 0, i32 0> ; lanes: %1[2], %1[0], %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__3_0_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__3_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__3_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__3_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> <i32 3, i32 0, i32 0> ; lanes: poison (2nd operand), %1[0], %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void 
@v_shuffle_v3bf16_v3bf16__4_0_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__4_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16
+; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v3, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__4_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16
+; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v4, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__4_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16
+; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %2 = shufflevector <4 x bfloat> %vec1, <4 x 
bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 4, i32 0, i32 0> ; lanes: %2[1], %1[0], %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__5_0_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4
+; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v3, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4
+; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v4, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s2, 0x5040100
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2
+; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1
+; 
GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 0, i32 0> ; lanes: %2[2], %1[0], %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__5_u_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v3, v2, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v4, v3, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_u_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v4, 
v3, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 poison, i32 0> ; lanes: %2[2], poison, %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__5_1_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_1_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0xffff
+; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0
+; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v3, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_1_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0
+; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v4, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_1_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; 
GFX940-NEXT: s_mov_b32 s2, 0xffff
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0
+; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 1, i32 0> ; lanes: %2[2], %1[1], %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__5_2_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_2_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:3]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX900-NEXT: global_store_short v4, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v4, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_2_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v4, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_2_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s2, 0x5040100
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2
+; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 2, i32 0> ; lanes: %2[2], %1[2], %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__5_3_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_3_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4
+; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v3, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_3_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: 
s_mov_b32 s4, 0x5040100
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4
+; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v4, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_3_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s2, 0x5040100
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2
+; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 3, i32 0> ; lanes: %2[2], %2[0], %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__5_4_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_4_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v3, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[1:2]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0xffff
+; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1
+; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4
+; GFX900-NEXT: global_store_dword v3, v1, s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; 
GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_4_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v2
+; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4
+; GFX90A-NEXT: global_store_dword v4, v1, s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_4_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s2, 0xffff
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v2
+; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 4, i32 0> ; lanes: %2[2], %2[1], %1[0]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__u_1_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__u_1_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dword 
v2, v0, s[16:17]
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__u_1_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__u_1_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> <i32 poison, i32 1, i32 1> ; lanes: poison, %1[1], %1[1]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__0_1_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__0_1_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dword v2, v0, s[16:17]
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; 
GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__0_1_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:1]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dword v2, v0, s[16:17]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__0_1_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:1]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
+  %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
+  %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 1> ; lanes: %1[0], %1[1], %1[1]
+  store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
+  ret void
+}
+
+define void @v_shuffle_v3bf16_v3bf16__1_1_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__1_1_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:1]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: global_store_dword v2, v1, s[16:17]
+; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__1_1_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, 
v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_u_1(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v3, v2, s[16:17] +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x 
bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 
sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_4_1: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v2 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; 
GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3bf16_v3bf16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 
v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: 
v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3bf16_v3bf16__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = 
shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: 
global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v3bf16__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__0_3_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v3bf16__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", 
"=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> 
poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> 
%vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_0_3(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x 
i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 
v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 4, i32 3> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def
v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 poison, i32 4, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT:
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 0, i32 4, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT:
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 1, i32 4, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +;
GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 2, i32 4, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__3_4_4(ptr addrspace(1) inreg
%ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 3, i32 4, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 4, i32 4, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define
void @v_shuffle_v3bf16_v3bf16__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf =
shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 4, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 poison, i32 4> + store <3 x bfloat> %shuf, ptr
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT:
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 0, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT:
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 1, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32
v1, 16, v2 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 2, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 5, i32 3, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL:
v_shuffle_v3bf16_v3bf16__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 poison, i32 5, i32 5> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +;
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 0, i32 5, i32 5> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def
v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> <i32 1, i32 5, i32 5> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2,
<3 x i32> <i32 2, i32 5, i32 5> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1,
<3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, 
<3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() 
+ %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: 
v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 
+; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v3bf16__5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v3bf16__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v3bf16__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v3bf16__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 
0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__u_u_u() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> poison + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__0_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3bf16_v3bf16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__1_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = 
shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__2_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__3_u_u() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__4_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void 
@s_shuffle_v3bf16_v3bf16__5_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_0_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_1_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_2_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_3_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_4_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3bf16_v3bf16__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_5_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_5_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_5_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_5_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_5_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> 
poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_5_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + 
ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_5_5() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__u_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 
+; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__0_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> 
zeroinitializer + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__1_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__2_0_0() { +; GFX900-LABEL: 
s_shuffle_v3bf16_v3bf16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__3_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__4_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_u_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_1_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_2_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3bf16_v3bf16__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_3_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_4_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__u_1_1() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__u_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__0_1_1() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__0_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__1_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3bf16_v3bf16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__2_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__3_1_1() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__3_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__4_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_u_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_0_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_2_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_3_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_3_1: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_4_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__u_2_2() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = 
shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__0_2_2() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__0_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__1_2_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__1_2_2: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__2_2_2() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__3_2_2() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> 
+ %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__4_2_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, 
<4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_2_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> 
%2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_u_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_0_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void 
@s_shuffle_v3bf16_v3bf16__5_1_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + 
ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_3_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_4_2() { +; GFX900-LABEL: 
s_shuffle_v3bf16_v3bf16__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void 
@s_shuffle_v3bf16_v3bf16__u_3_3() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__0_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__1_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__2_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3bf16_v3bf16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__3_3_3() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <3 x i32> + %2 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %2) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__4_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_u_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_0_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_1_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_2_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_4_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__u_4_4() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void 
@s_shuffle_v3bf16_v3bf16__0_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret 
void +} + +define void @s_shuffle_v3bf16_v3bf16__1_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = 
shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__2_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x 
bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__3_4_4() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__3_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__4_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__4_4_4: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, 
s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_u_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x 
bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_0_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 
= shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_1_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_2_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_3_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x 
i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__u_5_5() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__u_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__0_5_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__1_5_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__2_5_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__3_5_5() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__3_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__4_5_5() { +; GFX900-LABEL: 
s_shuffle_v3bf16_v3bf16__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_u_5() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__5_u_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; 
GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_0_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_1_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_2_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> 
asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_3_5() { +; GFX9-LABEL: s_shuffle_v3bf16_v3bf16__5_3_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} + +define void @s_shuffle_v3bf16_v3bf16__5_4_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v3bf16__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v3bf16__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v3bf16__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <3 x i32> + %3 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %3) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll new file mode 100644 index 0000000000000..937d61c7aaed9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3bf16.v4bf16.ll @@ -0,0 +1,15446 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3bf16_v4bf16__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v4bf16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> poison + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, 
s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; 
GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v4bf16__4_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x 
bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} 
+ +define void @v_shuffle_v3bf16_v4bf16__7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v3bf16_v4bf16__7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: 
global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 7, i32 1> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: 
global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 7, i32 2> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 7, i32 3> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 7, i32 4> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 7, i32 5> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; 
GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 7, i32 6> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, 
s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 7, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__u_0_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 poison, i32 0, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> zeroinitializer + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 1, i32 0, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, 
<3 x i32> <i32 2, i32 0, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 3, i32 0, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 4, i32 0, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 5, i32 0, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 6, i32 0, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 0, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 poison, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_1_0: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 1, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 2, i32 0> + store
<3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm
"; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 3, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT:
s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 4, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2 +; GFX940-NEXT:
global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 5, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v2, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT:
v_alignbit_b32 v1, v3, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 6, i32 0> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +
%vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 poison, i32 1, i32 1> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 1> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__1_1_1(ptr addrspace(1) inreg %ptr) { +;
GFX900-LABEL: v_shuffle_v3bf16_v4bf16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 1, i32 1, i32 1> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL:
v_shuffle_v3bf16_v4bf16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 2, i32 1, i32 1> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__3_1_1: +; GFX900: ;
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 3, i32 1, i32 1> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 4, i32 1, i32 1> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +;
GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 5, i32 1, i32 1> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__6_1_1: +; GFX900: ; %bb.0: +;
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf =
shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 6, i32 1, i32 1> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT:
global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 1, i32 1> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +;
GFX940-NEXT: v_alignbit_b32 v1, s0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 poison, i32 1> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ;
def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX90A-NEXT: 
global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v2, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: 
global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3bf16_v4bf16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword 
v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define 
void @v_shuffle_v3bf16_v4bf16__7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> 
%vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 
+; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 
v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", 
"=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> 
%vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, 
s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 
0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; 
GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_2_3: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; 
GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v4bf16__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x 
bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3bf16_v4bf16__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: global_store_short v2, 
v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 
x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3bf16_v4bf16__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_1_4(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x 
i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector 
<4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__u_5_5(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v3bf16_v4bf16__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__3_5_5: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3bf16_v4bf16__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; 
GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 
v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short 
v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> 
%vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3bf16_v4bf16__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_1_6(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x 
i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector 
<4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__u_7_7(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__0_7_7(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def 
$0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 0, i32 7, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, 
v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 1, i32 7, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 2, i32 7, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, 
s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 3, i32 7, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: 
v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 4, i32 7, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 5, i32 7, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 6, i32 7, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 poison, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_0_7: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 0, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 
v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 1, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 2, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 3, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: 
v_lshrrev_b32_e32 v3, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v3, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v3, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 4, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: 
global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 5, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3bf16_v4bf16__7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3bf16_v4bf16__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v2, 
v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3bf16_v4bf16__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3bf16_v4bf16__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> <i32 7, i32 6, i32 7> + store <3 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__u_u_u() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> poison + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x 
bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__0_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 poison, i32 poison> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__1_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 1, i32 poison, i32 poison> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__2_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__3_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__4_u_u() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__4_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__5_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__6_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_u_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_0_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_1_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_2_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_3_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: 
s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_4_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + 
%1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_5_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_6_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_6_u: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_7_u() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_7_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_7_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_7_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_7_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_7_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3bf16_v4bf16__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_7_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 
s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_7_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 
= shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_7_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__u_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__0_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> zeroinitializer + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__1_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__2_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> 
%vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__3_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__4_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__4_0_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__5_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__6_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3bf16_v4bf16__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_0_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_u_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_1_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_2_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_3_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_4_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_4_0: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_5_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_5_0: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_6_0() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__u_1_1() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__u_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 
x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__0_1_1() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__0_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__1_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__2_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void 
@s_shuffle_v3bf16_v4bf16__3_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__4_1_1() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__4_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART 
+; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__5_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__6_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x 
bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_1_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x 
bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_u_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> 
%shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_0_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> 
%shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_2_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> 
%shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_3_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_4_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_5_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_6_1() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__u_2_2() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__0_2_2() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__0_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__1_2_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__2_2_2() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> 
%1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__3_2_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__4_2_2() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__4_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__5_2_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> 
%vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__6_2_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void 
@s_shuffle_v3bf16_v4bf16__7_2_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_u_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_u_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_0_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 
s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_1_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_3_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_4_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_5_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_6_2() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__u_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__0_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__1_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> 
poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__2_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__3_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__4_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__5_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3bf16_v4bf16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__6_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__6_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_3_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_u_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_0_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_1_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_2_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_4_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_5_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3bf16_v4bf16__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_6_3() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__u_4_4() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__0_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__1_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__2_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__3_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__4_4_4() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__5_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, 
s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__6_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_4_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_u_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_0_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_1_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 
+; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_2_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: 
s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_3_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; 
GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_5_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_6_4() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> 
+ %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__u_5_5() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__u_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__0_5_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__0_5_5: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__1_5_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__2_5_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__3_5_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__4_5_5() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__4_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__5_5_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__6_5_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__6_5_5: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_5_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_u_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_0_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_1_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x 
bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_2_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x 
bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_3_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def 
$0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_4_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_6_5() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__u_6_6() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__u_6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; 
GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__0_6_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__1_6_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__2_6_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void 
@s_shuffle_v3bf16_v4bf16__3_6_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__4_6_6() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__4_6_6: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__5_6_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call 
<4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__6_6_6() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__6_6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_6_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_u_6() { +; GFX9-LABEL: s_shuffle_v3bf16_v4bf16__7_u_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_0_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_0_6: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_1_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_2_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_3_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_4_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_5_6() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__u_7_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, 
<4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__0_7_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void 
asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__1_7_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void 
asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__2_7_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void 
@s_shuffle_v3bf16_v4bf16__3_7_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void 
@s_shuffle_v3bf16_v4bf16__4_7_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__5_7_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, 
s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__6_7_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_u_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_0_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 
s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_1_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 
s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_2_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_3_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_4_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define 
void @s_shuffle_v3bf16_v4bf16__7_5_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} + +define void @s_shuffle_v3bf16_v4bf16__7_6_7() { +; GFX900-LABEL: s_shuffle_v3bf16_v4bf16__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3bf16_v4bf16__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3bf16_v4bf16__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <3 x i32> + %1 = shufflevector <3 x bfloat> %shuf, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %1) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll new file mode 100644 index 0000000000000..f36d6348adf2b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v2f16.ll @@ -0,0 +1,4042 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3f16_v2f16__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f16_v2f16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> poison + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3f16_v2f16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 
x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f16_v2f16__2_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_1_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 
+; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 
v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> zeroinitializer + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: 
global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) 
%ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v2, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = 
shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf 
= shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, 
s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: v_shuffle_v3f16_v2f16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: 
global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f16_v2f16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x 
half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f16_v2f16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: global_store_short v0, v1, 
s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_short v0, v2, 
s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v3f16_v2f16__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword 
v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v2f16__3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v2f16__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v2f16__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v2f16__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v3f16_v2f16__u_u_u() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> poison + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__0_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3f16_v2f16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__1_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__2_u_u() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__2_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_0_u() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_1_u() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 
x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_2_u() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_3_u() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_3_0() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_3_1() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_3_2() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__u_0_0() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__0_0_0() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> zeroinitializer + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__1_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__2_0_0() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_0_0() { 
+; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_u_0() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_1_0() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_2_0() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__u_1_1() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__u_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void 
@s_shuffle_v3f16_v2f16__0_1_1() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__0_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__1_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__2_1_1() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__2_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3f16_v2f16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_u_1() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_0_1() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_2_1() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__u_2_2() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__0_2_2() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__1_2_2() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__2_2_2() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_2_2() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} 
+ +define void @s_shuffle_v3f16_v2f16__3_u_2() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_0_2() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_1_2() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: 
s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__u_3_3() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__0_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__1_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__2_3_3() { +; GFX9-LABEL: s_shuffle_v3f16_v2f16__2_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_u_3() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 
s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_0_3() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_1_3() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v2f16__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v2f16__3_2_3() { +; GFX900-LABEL: s_shuffle_v3f16_v2f16__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v2f16__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3f16_v2f16__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll new file mode 100644 index 0000000000000..fb65270238940 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v3f16.ll @@ -0,0 +1,9009 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3f16_v3f16__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f16_v3f16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> poison + store <3 x half> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: 
v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f16_v3f16__3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword 
v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 
v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: 
global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", 
"=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, 
<3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword 
v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 
+; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> zeroinitializer + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 
+; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x 
half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; 
def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; 
GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, 
v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3f16_v3f16__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: 
global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v3, v2, s[16:17] +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_0_1(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v3f16_v3f16__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() 
+ %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, 
v0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v2 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; 
GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3f16_v3f16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; 
GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f16_v3f16__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, 
s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__u_3_3(ptr addrspace(1) inreg %ptr) { +; 
GFX9-LABEL: v_shuffle_v3f16_v3f16__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f16_v3f16__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: 
global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3f16_v3f16__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] 
offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_4_3(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__u_4_4(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 
= shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; 
GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f16_v3f16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, 
s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; 
GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 
+; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = 
shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", 
"=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, 
<4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: 
global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3f16_v3f16__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v3f16__5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v3f16__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v3f16__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 
v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v3f16__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v3f16_v3f16__u_u_u() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> poison + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__0_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__1_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__2_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x 
half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__3_u_u() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__4_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_0_u() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_1_u() { +; 
GFX900-LABEL: s_shuffle_v3f16_v3f16__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_2_u() { +; 
GFX900-LABEL: s_shuffle_v3f16_v3f16__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_3_u() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_4_u() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_5_u() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3f16_v3f16__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_5_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_5_1() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_5_2() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_5_3() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + 
%vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_5_4() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = 
shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_5_5() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__u_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__0_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf 
= shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> zeroinitializer + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__1_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__2_0_0() 
{ +; GFX900-LABEL: s_shuffle_v3f16_v3f16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__3_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__4_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_u_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_1_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 
s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_2_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_3_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_4_0() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__u_1_1() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__u_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__0_1_1() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__0_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__1_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__2_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__3_1_1() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__3_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__4_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__4_1_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3f16_v3f16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_u_1() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_u_1: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_0_1() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_0_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_2_1() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_2_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_3_1() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_3_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_4_1() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3f16_v3f16__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__u_2_2() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__0_2_2() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__0_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 
+; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__1_2_2() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x 
i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__2_2_2() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__3_2_2() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__4_2_2() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_2_2() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_u_2() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_u_2: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_0_2() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_1_2() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_3_2() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_4_2() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__u_3_3() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__0_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__1_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__2_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; 
use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__3_3_3() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <3 x i32> + %2 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %2) + ret void +} + +define void @s_shuffle_v3f16_v3f16__4_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x 
half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_u_3() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_0_3() { +; GFX900-LABEL: 
s_shuffle_v3f16_v3f16__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_1_3() { +; GFX900-LABEL: 
s_shuffle_v3f16_v3f16__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x 
half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_2_3() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} 
+ +define void @s_shuffle_v3f16_v3f16__5_4_3() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__u_4_4() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__0_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__1_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__2_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__3_4_4() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__3_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__4_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_u_4() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3f16_v3f16__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_0_4() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_1_4() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3f16_v3f16__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_2_4() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_3_4() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__u_5_5() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__u_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__0_5_5() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__1_5_5() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3f16_v3f16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__2_5_5() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__2_5_5: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__3_5_5() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__3_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x 
half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__4_5_5() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_u_5() { +; GFX9-LABEL: 
s_shuffle_v3f16_v3f16__5_u_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_0_5() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_1_5() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_2_5() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s1 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_3_5() { +; GFX9-LABEL: s_shuffle_v3f16_v3f16__5_3_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} + +define void @s_shuffle_v3f16_v3f16__5_4_5() { +; GFX900-LABEL: s_shuffle_v3f16_v3f16__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v3f16__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v3f16__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <3 x i32> + %3 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %3) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll new file mode 100644 index 0000000000000..6a7443ce2c7d6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f16.v4f16.ll @@ -0,0 +1,15446 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3f16_v4f16__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f16_v4f16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> poison + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; 
GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 
v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f16_v4f16__4_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f16_v4f16__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; 
def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v3f16_v4f16__7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_2_u(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; 
GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f16_v4f16__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f16_v4f16__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 
0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 
16, v1 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> zeroinitializer + store <3 x half> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 
v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] 
offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_1_0: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX900-NEXT: 
global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v2, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 
v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 
0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: global_store_short v4, v0, 
s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3f16_v4f16__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: 
global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_alignbit_b32 v1, v0, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3f16_v4f16__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store 
<3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v2, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x 
half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3f16_v4f16__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f16_v4f16__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f16_v4f16__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: 
global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_5_2: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3f16_v4f16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__1_3_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f16_v4f16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, 
s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3f16_v4f16__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: 
global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_4_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3f16_v4f16__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store 
<3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x 
half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f16_v4f16__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 
x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f16_v4f16__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f16_v4f16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 
sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def 
$0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = 
call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 
+; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> 
%vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, 
<4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; 
GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 
0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3f16_v4f16__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__4_5_5: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, 
v1, 16 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f16_v4f16__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; 
GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> 
asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: 
global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: 
v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: 
global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v3f16_v4f16__7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, 
v0, v2, 16 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_4_6: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; 
GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: v_lshrrev_b32_e32 
v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + 
ret void +} + +define void @v_shuffle_v3f16_v4f16__2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 
v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 
+; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm 
"; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> 
asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 
+; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v3, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v3, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3f16_v4f16__7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f16_v4f16__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f16_v4f16__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f16_v4f16__7_6_7: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + store <3 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v3f16_v4f16__u_u_u() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> poison + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__0_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3f16_v4f16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__1_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__2_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__3_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__4_u_u() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__4_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__5_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__6_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_u_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_0_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_1_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_2_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_3_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_4_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3f16_v4f16__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_5_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_6_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_7_u() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + 
call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_7_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_7_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> 
%1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_7_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_7_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_7_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_7_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_7_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_7_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 
s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_7_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__u_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__0_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> zeroinitializer + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__1_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, 
s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__2_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, 
s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__3_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__4_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__5_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void 
+} + +define void @s_shuffle_v3f16_v4f16__6_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_0_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_u_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_1_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_2_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_3_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_4_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_5_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_6_0() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__u_1_1() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__u_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; 
GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__0_1_1() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__0_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__1_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3f16_v4f16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__2_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__3_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__4_1_1() { +; GFX9-LABEL: 
s_shuffle_v3f16_v4f16__4_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__5_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__6_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 
s10, s3, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_1_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_u_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", 
"=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_0_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 
x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_2_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_3_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_4_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> 
asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_5_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_6_1() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__u_2_2() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__0_2_2() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__0_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__1_2_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__2_2_2() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__3_2_2() { +; 
GFX900-LABEL: s_shuffle_v3f16_v4f16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__4_2_2() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__4_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__5_2_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__6_2_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_2_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_u_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_0_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3f16_v4f16__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_1_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_3_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_4_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_5_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_6_2() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__u_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__0_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x 
i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__1_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__2_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__3_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, 
s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__4_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__5_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__6_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> 
asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_3_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() 
+ %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_u_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_0_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_1_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = 
shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_2_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> 
%shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_4_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, 
<4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_5_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> 
%vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_6_3() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, 
<3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__u_4_4() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__0_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__1_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__2_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__3_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__4_4_4() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__5_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__6_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, 
<4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_4_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_u_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_u_4: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_0_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_1_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_2_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_3_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_5_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_6_4() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__u_5_5() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__u_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__0_5_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__1_5_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__2_5_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__3_5_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__4_5_5() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__4_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def 
$0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__5_5_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__6_5_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_5_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3f16_v4f16__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_u_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_0_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_0_5: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_1_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3f16_v4f16__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_2_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3f16_v4f16__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_3_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_4_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_6_5() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__u_6_6() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__u_6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__0_6_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__1_6_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__1_6_6: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__2_6_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__3_6_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__4_6_6() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__4_6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__5_6_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3f16_v4f16__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__6_6_6() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__6_6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_6_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_6_6: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_u_6() { +; GFX9-LABEL: s_shuffle_v3f16_v4f16__7_u_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_0_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_1_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_2_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_3_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_4_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3f16_v4f16__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_5_6() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__u_7_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: 
s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__0_7_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__1_7_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__2_7_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x 
half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__3_7_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> 
+ %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__4_7_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__5_7_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__6_7_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__6_7_7: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_u_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_u_7: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_0_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_1_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 
s11, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_2_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_3_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_4_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void 
@s_shuffle_v3f16_v4f16__7_5_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} + +define void @s_shuffle_v3f16_v4f16__7_6_7() { +; GFX900-LABEL: s_shuffle_v3f16_v4f16__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, 
s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f16_v4f16__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f16_v4f16__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <3 x i32> + %1 = shufflevector <3 x half> %shuf, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %1) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll new file mode 100644 index 0000000000000..63eec2164916a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v2f32.ll @@ -0,0 +1,4166 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3f32_v2f32__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v2f32__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> poison + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x 
float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v2f32__2_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, 
<3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3f32_v2f32__3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3f32_v2f32__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f32_v2f32__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = 
shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> zeroinitializer + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f32_v2f32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, 
v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v2f32__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = 
shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v2f32__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + 
store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3f32_v2f32__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_1_2: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: 
global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + 
%vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v2f32__3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v2f32__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v2f32__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v2f32__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v3f32_v2f32__u_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__0_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__1_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__2_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call 
void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_0_u() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_1_u() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_2_u() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_3_u() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_3_0() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_3_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_3_2() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3f32_v2f32__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> 
%vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__u_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__0_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__1_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__2_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_0_0() { +; GFX900-LABEL: 
s_shuffle_v3f32_v2f32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_u_0() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_1_0() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_2_0() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__u_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__0_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__1_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_1_1: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__2_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret 
void +} + +define void @s_shuffle_v3f32_v2f32__3_u_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_0_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_2_1() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; 
GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__u_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3f32_v2f32__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__0_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__1_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__2_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", 
"{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_u_2() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_2: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_0_2() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_1_2() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__u_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__0_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__1_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__2_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_u_3() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + 
%shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_0_3() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + 
+define void @s_shuffle_v3f32_v2f32__3_1_3() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v2f32__3_2_3() { +; GFX900-LABEL: s_shuffle_v3f32_v2f32__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v2f32__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v2f32__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll new file mode 100644 index 0000000000000..1903674300ee9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v3f32.ll @@ -0,0 +1,8883 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3f32_v3f32__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v3f32__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> poison + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x 
float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v3f32__3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, 
v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, 
v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x 
float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> <i32 5, i32 5, i32 2> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> <i32 5, i32 5, i32 3> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; 
GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> <i32 5, i32 5, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> <i32 5, i32 5, i32 5> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> <i32 poison, i32 0, i32 0> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: 
global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> zeroinitializer + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 0> + store <3 x float> %shuf, ptr addrspace(1) 
%ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> <i32 2, i32 0, i32 0> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> <i32 3, i32 0, i32 0> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3f32_v3f32__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> <i32 4, i32 0, i32 0> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> <i32 5, i32 0, i32 0> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> <i32 5, i32 poison, i32 0> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> <i32 5, i32 1, i32 0> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> <i32 5, i32 2, i32 0> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, 
v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> <i32 5, i32 3, i32 0> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> <i32 5, i32 4, i32 0> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> <i32 poison, i32 1, i32 1> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> <i32 0, i32 1, i32 1> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> <i32 1, i32 1, i32 1> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + 
ret void +} + +define void @v_shuffle_v3f32_v3f32__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 
v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, 
align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 
+; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__4_2_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: 
global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v3f32__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x 
float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v3f32__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_u_3: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", 
"=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3f32_v3f32__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 
+; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> 
%vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + 
ret void +} + +define void @v_shuffle_v3f32_v3f32__5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3f32_v3f32__5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 
16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3f32_v3f32__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3f32_v3f32__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_1_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v3f32__5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v3f32__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v3f32__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v3f32__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v3f32_v3f32__u_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x 
i32> poison + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__0_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__1_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__2_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use 
$0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__3_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__4_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_0_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_1_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_2_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_3_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_4_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_5_u() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_5_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_5_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_5_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_5_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: 
s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_5_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v3f32_v3f32__5_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__u_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__0_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; 
GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__1_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__2_0_0() { +; GFX900-LABEL: 
s_shuffle_v3f32_v3f32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__3_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__3_0_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__4_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_u_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_1_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + 
%vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_2_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use 
$0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_3_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_4_0() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_4_0: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__u_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__0_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__1_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__2_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__3_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3f32_v3f32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__4_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", 
"{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_u_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_0_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_2_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_3_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_4_1() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__u_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__0_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__1_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__2_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__3_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void 
+} + +define void @s_shuffle_v3f32_v3f32__4_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_u_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3f32_v3f32__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_0_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_1_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_3_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_4_2() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x 
float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__u_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__0_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__1_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__2_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__3_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <3 x i32> + call void asm sideeffect "; use 
$0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__4_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_u_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_0_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_1_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_2_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def 
$0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_4_3() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__u_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__0_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__1_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__2_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: 
s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__3_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__4_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_u_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_0_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x 
float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_1_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_2_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_3_4() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; 
GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__u_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__0_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__1_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x 
float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__2_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__3_5_5() { +; GFX900-LABEL: 
s_shuffle_v3f32_v3f32__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__4_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_u_5() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + 
%vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_0_5() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_1_5() { +; 
GFX900-LABEL: s_shuffle_v3f32_v3f32__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_2_5() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_3_5() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v3f32__5_4_5() { +; GFX900-LABEL: s_shuffle_v3f32_v3f32__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v3f32__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v3f32__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll new file mode 100644 index 0000000000000..5c78a6e702b22 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3f32.v4f32.ll @@ -0,0 +1,15324 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3f32_v4f32__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v4f32__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> poison + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__1_u_u: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store 
<3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v4f32__4_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} 
+ +define void @v_shuffle_v3f32_v4f32__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3f32_v4f32__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_5_u(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3f32_v4f32__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: 
global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, 
v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_7_7(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 
0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> zeroinitializer + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3f32_v4f32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; 
GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 
x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> 
poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__u_2_2: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x 
float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 
+; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> 
asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void 
+} + +define void @v_shuffle_v3f32_v4f32__7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_2: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: 
global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3f32_v4f32__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: 
global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: 
global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: 
global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 7, i32 6, i32 3> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v4f32__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> <i32 poison, i32 4, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2],
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> <i32 0, i32 4, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> <i32 1, i32 4, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> <i32 2, i32 4, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr,
align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> <i32 3, i32 4, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3f32_v4f32__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> <i32 4, i32 4, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void
@v_shuffle_v3f32_v4f32__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 5, i32 4, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +;
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 6, i32 4, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 7, i32 4, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 7, i32 poison, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +;
GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 7, i32 0, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +;
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 7, i32 1, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_2_4: +;
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 7, i32 2, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL:
v_shuffle_v3f32_v4f32__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 7, i32 3, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 7, i32 5, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT:
global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 7, i32 6, i32 4> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 poison, i32 5, i32 5> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__0_5_5(ptr addrspace(1) inreg %ptr)
{ +; GFX900-LABEL: v_shuffle_v3f32_v4f32__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 0, i32 5, i32 5> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 1, i32 5, i32 5> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def
v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> <i32 2, i32 5, i32 5> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +;
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, 
v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 
v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = 
call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_0_5: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_1_5(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_5: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_5: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3f32_v4f32__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__u_6_6: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: 
global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3f32_v4f32__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, 
v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3f32_v4f32__6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3f32_v4f32__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_2_6: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 
v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3f32_v4f32__u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, 
v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; 
GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; 
GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; 
GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__6_7_7: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx3 v6, 
v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x 
float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3f32_v4f32__7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3f32_v4f32__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3f32_v4f32__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3f32_v4f32__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + store <3 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v3f32_v4f32__u_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3f32_v4f32__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__0_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__1_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__2_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__3_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__4_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__5_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__6_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_u_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_u: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_0_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_0_u: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_1_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_2_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect 
"; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_3_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_4_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_5_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_6_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def 
$0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_7_u() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_7_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_7_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_7_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_7_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; 
GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_7_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_7_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_7_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_7_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__u_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__0_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 
x float> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__1_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__2_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; 
GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__3_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, 
s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__4_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__5_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", 
"{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__6_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_0_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_0_0: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_u_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_1_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_2_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_3_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_4_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_5_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_6_0() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__u_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__0_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__1_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__2_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use 
$0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__3_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__4_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__5_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__6_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_1_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm 
sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_u_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_0_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_2_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_3_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_4_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_5_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_6_1() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__u_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__0_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__1_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__1_2_2: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__2_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__3_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__4_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__5_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 
s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__6_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_2_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, 
s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_u_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x 
float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_0_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_1_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_1_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_3_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_4_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_5_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_6_2() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__u_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v3f32_v4f32__0_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__1_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__2_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__3_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", 
"{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__4_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__5_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__6_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_3_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_u_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_0_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_1_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_2_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + 
%vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_4_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", 
"{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_5_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_6_3() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__u_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_4_4: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__0_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__1_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__1_4_4: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__2_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__3_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__4_4_4() { +; GFX900-LABEL: 
s_shuffle_v3f32_v4f32__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__5_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__6_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> 
%vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_4_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_u_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 
+; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_0_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_1_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_2_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_3_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_5_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x 
i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_6_4() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__u_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__0_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__1_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__2_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__3_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use 
$0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__4_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__5_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__6_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_5_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_u_5() { +; GFX900-LABEL: 
s_shuffle_v3f32_v4f32__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_0_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_1_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_2_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_3_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_3_5: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_4_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_6_5() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__u_6_6() { +; GFX900-LABEL: 
s_shuffle_v3f32_v4f32__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__0_6_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__1_6_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__2_6_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__3_6_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__4_6_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__5_6_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__6_6_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_6_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x 
float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_u_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_0_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_1_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_2_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_3_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_4_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def 
$0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_5_6() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__u_7_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__0_7_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__1_7_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__2_7_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__3_7_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__4_7_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__5_7_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__6_7_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_u_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_0_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_1_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x 
i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_2_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_3_7() { +; GFX900-LABEL: 
s_shuffle_v3f32_v4f32__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_4_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_5_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} + +define void @s_shuffle_v3f32_v4f32__7_6_7() { +; GFX900-LABEL: s_shuffle_v3f32_v4f32__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3f32_v4f32__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3f32_v4f32__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x float> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v2i16.ll new file mode 100644 index 0000000000000..2c1d70e7211a5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v2i16.ll @@ -0,0 +1,3964 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3i16_v2i16__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i16_v2i16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> poison + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3i16_v2i16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i16_v2i16__2_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; 
GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: global_store_short v0, 
v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 
+; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v2, v2, s2 +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, 
s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> zeroinitializer + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] 
offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__2_0_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v2, s2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: 
global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__1_1_1: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, 
s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v2, 16 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v2, 16 +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v2, v2, 16 +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i16_v2i16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> 
poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__1_2_2: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i16_v2i16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: global_store_short v0, 
v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_short v0, v2, 
s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__0_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 
16, v2 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3i16_v2i16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX90A-NEXT: global_store_short v0, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: global_store_short v0, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_dword v0, v1, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: global_store_dword v0, v1, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v2i16__3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v2i16__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v0, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v2i16__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v0, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v0, v2, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v2i16__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v3i16_v2i16__u_u_u() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> poison + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__0_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__1_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__2_u_u() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__2_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_0_u() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_1_u() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_1_u: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_2_u() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_3_u() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3i16_v2i16__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_3_0() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_3_1() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; 
def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_3_2() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__3_3_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__u_0_0() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__u_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__0_0_0() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__0_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> zeroinitializer + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__1_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__1_0_0: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__2_0_0() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__2_0_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> 
poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_u_0() { +; GFX900-LABEL: 
s_shuffle_v3i16_v2i16__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_1_0() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, 
s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_2_0() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__u_1_1() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__u_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__0_1_1() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__0_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__1_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__2_1_1() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__2_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_u_1() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_0_1() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x 
i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_2_1() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm 
"; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__u_2_2() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__0_2_2() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> 
poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__1_2_2() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__2_2_2() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; 
use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_2_2() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_u_2() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__3_u_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: 
;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_0_2() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 
x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_1_2() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__u_3_3() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__u_3_3: 
+; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__0_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, 
s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__1_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", 
"=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__2_3_3() { +; GFX9-LABEL: s_shuffle_v3i16_v2i16__2_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_u_3() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_0_3() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_1_3() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v2i16__3_2_3() { +; GFX900-LABEL: s_shuffle_v3i16_v2i16__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v2i16__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v2i16__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v3i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v3i16.ll new file mode 100644 index 0000000000000..6b35cae4e8e9d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v3i16.ll @@ -0,0 +1,8900 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3i16_v3i16__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i16_v3i16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> poison + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: 
v_shuffle_v3i16_v3i16__3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x 
i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, 
s4, v1, v0 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_5_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] 
offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define 
void @v_shuffle_v3i16_v3i16__5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v3i16_v3i16__5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define 
void @v_shuffle_v3i16_v3i16__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v3i16_v3i16__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 
v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> zeroinitializer + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_0_0: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; 
GFX900-NEXT: global_store_dword v3, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 
+; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def 
$0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v2 +; GFX940-NEXT: global_store_short v4, 
v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, 
<4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; 
GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_1_1: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector 
<4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v2, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = 
shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: 
global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; 
GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v2 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__u_2_2: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__2_2_2: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_2_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v3, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v3, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v3, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: 
global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__u_3_3(ptr addrspace(1) inreg %ptr) { +; 
GFX9-LABEL: v_shuffle_v3i16_v3i16__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) 
%ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i16_v3i16__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_4_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v3i16_v3i16__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_short_d16_hi v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; 
GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 
0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i16_v3i16__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x 
i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = 
shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v3i16__5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v3i16__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v3i16__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v3i16__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 
x i16> %2, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v3i16_v3i16__u_u_u() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> poison + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__0_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x 
i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__1_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__2_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: s_shuffle_v3i16_v3i16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__3_u_u() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__4_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_0_u() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_1_u() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, 
s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_2_u() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_3_u() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_4_u() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_5_u() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_5_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_5_1() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_5_2() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_5_3() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i16_v3i16__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_5_4() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_5_5() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__u_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__0_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> zeroinitializer + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__1_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__2_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__3_0_0() 
{ +; GFX900-LABEL: s_shuffle_v3i16_v3i16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__4_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, 
s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_u_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_1_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_2_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_3_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_4_0() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3i16_v3i16__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__u_1_1() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__u_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = 
shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__0_1_1() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__0_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__1_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; 
GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__2_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> 
%shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__3_1_1() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__3_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__4_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_u_1() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_0_1() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_2_1() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_3_1() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_4_1() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__u_2_2() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__0_2_2() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__0_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x 
i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__1_2_2() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__2_2_2() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def 
s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__3_2_2() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__3_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__4_2_2() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_2_2() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_u_2() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_0_2() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_1_2() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_3_2() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_4_2() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__u_3_3() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__0_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__1_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__2_3_3() { +; GFX900-LABEL: 
s_shuffle_v3i16_v3i16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret void +} + +define void @s_shuffle_v3i16_v3i16__3_3_3() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <3 x i32> + %2 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %2) + ret 
void +} + +define void @s_shuffle_v3i16_v3i16__4_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_u_3() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_0_3() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_1_3() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_2_3() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_4_3() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__u_4_4() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__0_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__1_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__2_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__3_4_4() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__3_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND 
+; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__4_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + 
%shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_u_4() { +; 
GFX900-LABEL: s_shuffle_v3i16_v3i16__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_0_4() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_1_4() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 
s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_2_4() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; 
GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_3_4() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__u_5_5() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__u_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x 
i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__0_5_5() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__1_5_5() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__2_5_5() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__3_5_5() { +; GFX9-LABEL: 
s_shuffle_v3i16_v3i16__3_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__4_5_5() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_u_5() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__5_u_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_mov_b32 s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_0_5() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_1_5() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_2_5() { +; GFX900-LABEL: s_shuffle_v3i16_v3i16__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v3i16__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v3i16__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_3_5() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__5_3_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} + +define void @s_shuffle_v3i16_v3i16__5_4_5() { +; GFX9-LABEL: s_shuffle_v3i16_v3i16__5_4_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_lh_b32_b16 s10, s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <3 x i32> + %3 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %3) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v4i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v4i16.ll new file mode 100644 index 0000000000000..f447e2e9a3eaf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i16.v4i16.ll @@ -0,0 +1,15161 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3i16_v4i16__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i16_v4i16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> poison + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i16_v4i16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i16_v4i16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: 
v_shuffle_v3i16_v4i16__4_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i16_v4i16__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3i16_v4i16__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + 
ret void +} + +define void @v_shuffle_v3i16_v4i16__7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 
0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: 
global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_7_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: 
global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> zeroinitializer + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: 
global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i16_v4i16__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v3i16_v4i16__7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret 
void +} + +define void @v_shuffle_v3i16_v4i16__7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm 
"; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v2, v2, 16 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v3, 16 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v3, 16 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i16_v4i16__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; 
GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword 
v3, v1, s[16:17] +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v3i16_v4i16__7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) 
%ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 
= call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v2, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__0_2_2(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 
v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__5_2_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v2, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: 
global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] 
offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: 
global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i16_v4i16__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i16_v4i16__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, 
v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] 
offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword 
v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, 
v0, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 
v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i16_v4i16__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i16_v4i16__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i16_v4i16__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v3i16_v4i16__6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: 
global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: 
global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; 
GFX900-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, 
v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v3, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX90A-NEXT: global_store_short v4, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX940-NEXT: global_store_short v4, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; 
GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v3i16_v4i16__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: 
v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, 
v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: 
global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_short_d16_hi v3, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short_d16_hi v3, v1, s[16:17] 
offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short_d16_hi v4, v2, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short_d16_hi v4, v2, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; 
GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_short_d16_hi v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_short_d16_hi v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__2_6_6: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v1, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3i16_v4i16__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, 
s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_6_6: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 
v0, s0, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_short v3, v2, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_short v4, v3, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_short v4, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: 
global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v3i16_v4i16__0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; 
GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; 
GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_short_d16_hi v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3i16_v4i16__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_short_d16_hi v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dword v2, v1, s[16:17] +; GFX900-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: global_store_dword v2, v1, s[16:17] +; GFX90A-NEXT: global_store_short v2, v0, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: global_store_dword v2, v1, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v0, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: global_store_dword v3, v0, s[16:17] +; GFX900-NEXT: global_store_short v3, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX900-NEXT: global_store_dword v4, v0, s[16:17] +; GFX900-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: global_store_dword v4, v0, s[16:17] +; GFX90A-NEXT: global_store_short v4, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX940-NEXT: global_store_dword v4, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v4, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v3, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v3, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v3, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_5_7: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v3i16_v4i16__7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i16_v4i16__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX900-NEXT: global_store_dword v2, v0, s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i16_v4i16__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_short v2, v1, s[16:17] offset:4 +; GFX90A-NEXT: global_store_dword v2, v0, s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i16_v4i16__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_short v2, v1, s[0:1] offset:4 sc0 sc1 +; GFX940-NEXT: global_store_dword v2, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + store <3 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v3i16_v4i16__u_u_u() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> poison + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__0_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__1_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__2_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__3_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__3_u_u: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__4_u_u() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__4_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__5_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__6_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_u_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + 
%vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_0_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> 
%shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_1_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_2_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_3_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_4_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_5_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_6_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> 
+ %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_7_u() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_7_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_7_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_7_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_7_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: 
s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_7_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_7_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, 
<3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_7_6() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__7_7_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_7_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; 
GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__u_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + 
+define void @s_shuffle_v3i16_v4i16__0_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> zeroinitializer + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__1_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__2_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__3_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, 
s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__4_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__5_0_0() { +; 
GFX900-LABEL: s_shuffle_v3i16_v4i16__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__6_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__6_0_0: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_0_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_u_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_1_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_2_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_2_0: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_3_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_4_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_5_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; 
GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_6_0() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__u_1_1() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__u_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__0_1_1() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__0_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm 
"; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__1_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__2_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__3_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__4_1_1() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__4_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__5_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__6_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3i16_v4i16__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_1_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_u_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_0_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_2_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_3_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i16_v4i16__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_4_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_5_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_6_1() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__u_2_2() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__0_2_2() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__0_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__1_2_2() { +; GFX900-LABEL: 
s_shuffle_v3i16_v4i16__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__2_2_2() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x 
i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__3_2_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__4_2_2() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__4_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: 
s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__5_2_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + 
%vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__6_2_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret 
void +} + +define void @s_shuffle_v3i16_v4i16__7_2_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_u_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_0_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_1_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3i16_v4i16__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_3_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_4_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_5_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_6_2() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 
s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__u_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void 
@s_shuffle_v3i16_v4i16__0_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__1_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__2_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i16_v4i16__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__3_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__4_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__5_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__6_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, 
s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_3_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_u_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_0_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_1_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; 
GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_2_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_4_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_5_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i16_v4i16__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_6_3() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__u_4_4() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__0_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__1_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__2_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__3_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__4_4_4() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__5_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__5_4_4: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__6_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_4_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, 
s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_u_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void 
+} + +define void @s_shuffle_v3i16_v4i16__7_0_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void 
@s_shuffle_v3i16_v4i16__7_1_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_2_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_3_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_5_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_6_4() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; 
GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__u_5_5() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__u_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__0_5_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, 
s6 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__1_5_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__2_5_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__3_5_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__4_5_5() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__4_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__5_5_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__6_5_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_5_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_u_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, 
<3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_0_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector 
<3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_1_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void 
@s_shuffle_v3i16_v4i16__7_2_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_3_5() 
{ +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, s6, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s6, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s2, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_4_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_6_5() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__u_6_6() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__u_6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshl_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void 
@s_shuffle_v3i16_v4i16__0_6_6() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__1_6_6() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__2_6_6() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__3_6_6() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__4_6_6() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__4_6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__5_6_6() { +; GFX900-LABEL: 
s_shuffle_v3i16_v4i16__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__6_6_6() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__6_6_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() 
+ %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_6_6() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_u_6() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__7_u_6: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_lshr_b32 s10, s11, 16 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_0_6() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s11, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_1_6() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + 
%shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_2_6() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_3_6() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_4_6() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_5_6() { +; GFX9-LABEL: s_shuffle_v3i16_v4i16__7_5_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s10, s11, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 
x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__u_7_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__0_7_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__1_7_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_lshr_b32 s11, 
s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__2_7_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3i16_v4i16__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__3_7_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__4_7_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__5_7_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__6_7_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_u_7() { 
+; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_0_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_1_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_1_7: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_2_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_3_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_lshr_b32 s11, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: 
s_lshr_b32 s11, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_lshr_b32 s11, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_4_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_5_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> 
%vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} + +define void @s_shuffle_v3i16_v4i16__7_6_7() { +; GFX900-LABEL: s_shuffle_v3i16_v4i16__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i16_v4i16__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i16_v4i16__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <3 x i32> + %1 = shufflevector <3 x i16> %shuf, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %1) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll new file mode 100644 index 0000000000000..d0ac02aab4a3a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v2i32.ll @@ -0,0 +1,4166 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3i32_v2i32__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i32_v2i32__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> poison + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x 
i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i32_v2i32__2_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_1_u(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3i32_v2i32__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, 
v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> zeroinitializer + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 
v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, 
v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x 
i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 
x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + 
ret void +} + +define void @v_shuffle_v3i32_v2i32__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", 
"=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, 
align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3i32_v2i32__3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3i32_v2i32__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i32_v2i32__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i32_v2i32__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_2: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 
+; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; 
def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i32_v2i32__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i32_v2i32__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v2i32__3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v2i32__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v2i32__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v2i32__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v3i32_v2i32__u_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v3i32_v2i32__0_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__1_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__2_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3i32_v2i32__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_0_u() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i32_v2i32__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_1_u() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_2_u() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_3_u() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_3_0() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_3_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_3_2() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use 
$0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__u_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3i32_v2i32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__0_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__1_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__2_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_u_0() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i32_v2i32__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_1_0() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 
+; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_2_0() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector 
<2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__u_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__0_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__1_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__2_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_u_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_0_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_2_1() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_2_1: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__u_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__0_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__1_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__2_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_u_2() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, 
<2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_0_2() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_1_2() { +; 
GFX900-LABEL: s_shuffle_v3i32_v2i32__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__u_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3i32_v2i32__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__0_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i32_v2i32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__1_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__2_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_u_3() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_0_3() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_1_3() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: 
s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v2i32__3_2_3() { +; GFX900-LABEL: s_shuffle_v3i32_v2i32__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v2i32__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v2i32__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll new file mode 100644 index 0000000000000..321a3c0df0e7d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v3i32.ll @@ -0,0 +1,8883 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3i32_v3i32__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i32_v3i32__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> poison + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3i32_v3i32__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> 
%vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i32_v3i32__3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x 
i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() 
+ %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + 
%vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x 
i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_5_4(ptr addrspace(1) inreg %ptr) 
{ +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3i32_v3i32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> zeroinitializer + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, 
v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void 
+} + +define void @v_shuffle_v3i32_v3i32__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 poison, i32 0> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +;
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 1, i32 0> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +;
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 2, i32 0> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT:
;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 3, i32 0> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +;
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 4, i32 0> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2],
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 poison, i32 1, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +;
+; GFX940-LABEL: v_shuffle_v3i32_v3i32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 0, i32 1, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +;
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 1, i32 1, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 2, i32 1, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 3, i32 1, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_1_1:
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 4, i32 1, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0)
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 1, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def
v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 poison, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT:
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 0, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 2, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +;
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 3, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +;
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 4, i32 1> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT:
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 poison, i32 2, i32 2> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf =
shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; 
GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x 
i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i32_v3i32__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, 
align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i32_v3i32__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i32_v3i32__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 
+; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v3, 
v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 
16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 
v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", 
"=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_0_4: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; 
GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 
v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + 
+define void @v_shuffle_v3i32_v3i32__5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: 
global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_3_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v3i32__5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v3i32__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v3i32__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v3i32__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v3i32_v3i32__u_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__0_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__1_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__2_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__3_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__3_u_u: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__4_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void 
+} + +define void @s_shuffle_v3i32_v3i32__5_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_0_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_1_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_2_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_3_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v3i32_v3i32__5_4_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_5_u() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_5_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_5_0: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_5_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_5_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_5_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_5_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__u_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; 
GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__0_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__1_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__2_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__3_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__4_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = 
shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v3i32_v3i32__5_u_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_1_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_2_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, 
s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_3_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3i32_v3i32__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_4_0() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 4, i32 0> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__u_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32
s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 poison, i32 1, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__0_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 0, i32 1, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__1_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT:
;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 1, i32 1, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__2_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__2_1_1: +; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 2, i32 1, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__3_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 3, i32 1, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__4_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 4, i32 1, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +;
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 1, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_u_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT:
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 poison, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_0_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT:
s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 0, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_2_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL:
s_shuffle_v3i32_v3i32__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 2, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_3_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def
s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 3, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_4_1() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ;
use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 4, i32 1> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__u_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 poison, i32 2, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__0_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6]
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 0, i32 2, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__1_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def
s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 1, i32 2, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__2_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 2, i32 2, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__3_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> <i32 3, i32 2, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__4_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT:
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 4, i32 2, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_2_2: +; GFX940: ; %bb.0: +;
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 2, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_u_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 poison, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_0_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x
i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 0, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_1_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 1, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_3_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT:
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 3, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_4_2() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT:
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> <i32 5, i32 4, i32 2> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__u_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT:
s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__0_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__1_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__2_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() 
+ %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__3_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__4_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_u_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_0_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_1_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_2_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_2_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_4_3() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__u_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> 
asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__0_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__1_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__1_4_4: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__2_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__3_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__4_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_u_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_0_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_1_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_2_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_3_4() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = 
call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__u_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__0_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__1_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__2_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__3_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; 
def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__4_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_u_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_0_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_1_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_2_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_3_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v3i32__5_4_5() { +; GFX900-LABEL: s_shuffle_v3i32_v3i32__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v3i32__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v3i32__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll new file mode 100644 index 0000000000000..4dd8ce449626b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i32.v4i32.ll @@ -0,0 +1,15324 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3i32_v4i32__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i32_v4i32__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> poison + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i32_v4i32__4_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_u_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, 
<4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_u: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 
v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; 
def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; 
def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", 
"=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 
= call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_5: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__u_0_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> zeroinitializer + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i32_v4i32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; 
GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, 
v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 
v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 
v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 
v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; 
GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: 
global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, 
v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__1_1_1(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: 
global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 
= call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector 
<4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> 
%shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, 
<4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store 
<3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + 
+define void @v_shuffle_v3i32_v4i32__7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret 
void +} + +define void @v_shuffle_v3i32_v4i32__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: v_shuffle_v3i32_v4i32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__1_2_2: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_2_2: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; 
GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, 
ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, 
v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector 
<4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) 
%ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3i32_v4i32__7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3i32_v4i32__7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i32_v4i32__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: 
v_shuffle_v3i32_v4i32__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, 
v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i32_v4i32__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; 
GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + 
store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: 
v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; 
GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_2_4: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i32_v4i32__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_4: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: 
v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: 
global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x 
i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 
x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_5_5: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v7, 
v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> 
%shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i32_v4i32__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3i32_v4i32__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_6_6(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, 
v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call 
<4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 
v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 
v2, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; 
GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3i32_v4i32__7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i32_v4i32__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i32_v4i32__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i32_v4i32__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + store <3 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v3i32_v4i32__u_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__0_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__1_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__2_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__3_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__4_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__5_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__6_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_u_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_0_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_1_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_2_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define 
void @s_shuffle_v3i32_v4i32__7_3_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_4_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_5_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i32_v4i32__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_6_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm 
sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_7_u() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_7_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_7_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3i32_v4i32__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_7_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_7_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_7_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: 
s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_7_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + 
+define void @s_shuffle_v3i32_v4i32__7_7_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_7_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_7_7: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__u_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__0_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__1_0_0() { +; 
GFX900-LABEL: s_shuffle_v3i32_v4i32__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__2_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3i32_v4i32__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__3_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__3_0_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__4_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret 
void +} + +define void @s_shuffle_v3i32_v4i32__5_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__6_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_0_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_u_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_1_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_2_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: 
s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_3_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_4_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_5_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_6_0() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", 
"=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__u_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__0_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__0_1_1: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__1_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__2_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__3_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__4_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__4_1_1: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__5_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__6_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v3i32_v4i32__7_1_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_u_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_0_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_2_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_3_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, 
s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_4_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_5_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_6_1() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: 
s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__u_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__0_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__1_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__2_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__3_2_2() { +; GFX900-LABEL: 
s_shuffle_v3i32_v4i32__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__4_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__5_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__6_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def 
$0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_2_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_u_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_2: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_0_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_1_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_3_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_4_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_5_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> 
asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_6_2() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v3i32_v4i32__u_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__0_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__1_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; 
GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__2_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__3_3_3() { +; GFX900-LABEL: 
s_shuffle_v3i32_v4i32__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__4_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__5_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i32_v4i32__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__6_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_3_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, 
s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_u_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_0_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v3i32_v4i32__7_1_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_2_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_4_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_5_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_5_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_6_3() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__u_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v3i32_v4i32__0_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__1_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__2_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__3_4_4() { +; GFX900-LABEL: 
s_shuffle_v3i32_v4i32__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__4_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__5_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__6_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_4_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_u_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_0_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s4 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_1_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> 
%vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_2_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_3_4() { +; GFX900-LABEL: 
s_shuffle_v3i32_v4i32__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_5_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_6_4() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__u_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__0_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use 
$0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__1_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__2_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__3_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__4_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__5_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__6_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_5_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_u_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_u_5: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_0_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: 
s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_1_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() 
+ %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_2_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v3i32_v4i32__7_3_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_4_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_6_5() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__u_6_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, 
s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__0_6_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x 
i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__1_6_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__2_6_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__3_6_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__4_6_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__5_6_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x 
i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__6_6_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_6_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_u_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_0_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_1_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define 
void @s_shuffle_v3i32_v4i32__7_2_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_3_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_4_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_4_6: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_5_6() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__u_7_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__0_7_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__0_7_7: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__1_7_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__2_7_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__3_7_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__4_7_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i32_v4i32__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__5_7_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() 
+ %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__6_7_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_u_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_0_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_1_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_2_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_2_7: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_3_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_4_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> 
%vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_5_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} + +define void @s_shuffle_v3i32_v4i32__7_6_7() { +; GFX900-LABEL: s_shuffle_v3i32_v4i32__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i32_v4i32__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i32_v4i32__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x i32> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll new file mode 100644 index 0000000000000..5d6deb4b33093 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll @@ -0,0 +1,4508 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3i64_v2i64__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v2i64__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> poison + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v2i64__2_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: 
v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; 
GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> 
%shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v3i64_v2i64__3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_3_3: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], 
s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> zeroinitializer + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, 
v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 
v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__0_1_1: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], 
s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v2i64__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v2i64__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__1_3_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], 
s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v2i64__3_1_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v2i64__3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v2i64__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i64_v2i64__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v2i64__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v3i64_v2i64__u_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__0_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__1_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3i64_v2i64__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__2_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v3i64_v2i64__3_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_0_u() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_1_u() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_1_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_2_u() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_3_u() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_3_0() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", 
"{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_3_1() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v3i64_v2i64__3_3_2() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; 
GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__u_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__0_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i64_v2i64__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__1_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__2_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_0_0() { +; GFX900-LABEL: 
s_shuffle_v3i64_v2i64__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_u_0() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_1_0() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, 
s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_2_0() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 
s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__u_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__0_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: 
s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__1_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v3i64_v2i64__2_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_u_1() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_u_1: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_0_1() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_0_1: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_2_1() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__u_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__0_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__1_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__2_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_u_2() { +; GFX900-LABEL: 
s_shuffle_v3i64_v2i64__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_0_2() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_1_2() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__u_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; 
GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__0_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__0_3_3: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__1_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__2_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_u_3() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_0_3() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v3i64_v2i64__3_1_3() { +; GFX900-LABEL: s_shuffle_v3i64_v2i64__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v2i64__3_2_3() { +; GFX900-LABEL: 
s_shuffle_v3i64_v2i64__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v2i64__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v2i64__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll new file mode 100644 index 0000000000000..f3fe5e310649a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll @@ -0,0 +1,9583 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3i64_v3i64__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v3i64__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> poison + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] 
offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v3i64__3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 
v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i64_v3i64__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] 
offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: 
global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3i64_v3i64__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> zeroinitializer + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def 
$0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr 
addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> 
asm "; def $0", "=v"()
+  ; Mask indices 0-2 address %vec0 and 3-5 address %vec1.
+  ; Result lanes: [%vec1[1], %vec0[0], %vec0[0]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 4, i32 0, i32 0>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__5_0_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v8, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v6
+; GFX900-NEXT: v_mov_b32_e32 v3, v7
+; GFX900-NEXT: v_mov_b32_e32 v4, v0
+; GFX900-NEXT: v_mov_b32_e32 v5, v1
+; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v7
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1
+;
GFX940-NEXT: v_mov_b32_e32 v2, v6
+; GFX940-NEXT: v_mov_b32_e32 v3, v7
+; GFX940-NEXT: v_mov_b32_e32 v4, v0
+; GFX940-NEXT: v_mov_b32_e32 v5, v1
+; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  %vec1 = call <3 x i64> asm "; def $0", "=v"()
+  ; Mask indices 0-2 address %vec0 and 3-5 address %vec1.
+  ; Result lanes: [%vec1[2], %vec0[0], %vec0[0]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 0, i32 0>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__5_u_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v8, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v1, v7
+; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v7
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_u_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ;
def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  %vec1 = call <3 x i64> asm "; def $0", "=v"()
+  ; Mask indices 0-2 address %vec0 and 3-5 address %vec1; "u" in the test
+  ; name marks a don't-care lane.
+  ; Result lanes: [%vec1[2], poison, %vec0[0]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 poison, i32 0>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__5_1_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_1_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v10, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16
+; GFX900-NEXT: v_mov_b32_e32 v0, v8
+; GFX900-NEXT: v_mov_b32_e32 v1, v9
+; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_1_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v9
+; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_1_0:
+;
GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v10, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v8
+; GFX940-NEXT: v_mov_b32_e32 v1, v9
+; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  %vec1 = call <3 x i64> asm "; def $0", "=v"()
+  ; Mask indices 0-2 address %vec0 and 3-5 address %vec1.
+  ; Result lanes: [%vec1[2], %vec0[1], %vec0[0]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 1, i32 0>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__5_2_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_2_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v12, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[6:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v2, v10
+; GFX900-NEXT: v_mov_b32_e32 v3, v11
+; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_2_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[6:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v2, v10
+; GFX90A-NEXT: v_mov_b32_e32 v3, v11
+; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT:
global_store_dwordx4 v12, v[2:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_2_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v12, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[6:11]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v10
+; GFX940-NEXT: v_mov_b32_e32 v3, v11
+; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  %vec1 = call <3 x i64> asm "; def $0", "=v"()
+  ; Mask indices 0-2 address %vec0 and 3-5 address %vec1.
+  ; Result lanes: [%vec1[2], %vec0[2], %vec0[0]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 2, i32 0>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__5_3_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v8, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v1, v7
+; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT:
global_store_dwordx2 v8, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v7
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_3_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  %vec1 = call <3 x i64> asm "; def $0", "=v"()
+  ; Mask indices 0-2 address %vec0 and 3-5 address %vec1.
+  ; Result lanes: [%vec1[2], %vec1[0], %vec0[0]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 3, i32 0>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__5_4_0(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[2:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v8, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v6
+; GFX900-NEXT: v_mov_b32_e32 v3, v7
+; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT:
;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[2:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v7
+; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_4_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v8, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v6
+; GFX940-NEXT: v_mov_b32_e32 v3, v7
+; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  %vec1 = call <3 x i64> asm "; def $0", "=v"()
+  ; Mask indices 0-2 address %vec0 and 3-5 address %vec1.
+  ; Result lanes: [%vec1[2], %vec1[1], %vec0[0]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 4, i32 0>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__u_1_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__u_1_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__u_1_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT:
;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__u_1_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  ; "u" in the test name marks a don't-care lane.
+  ; Result lanes: [poison, %vec0[1], %vec0[1]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 poison, i32 1, i32 1>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__0_1_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__0_1_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__0_1_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__0_1_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  ; Result lanes: [%vec0[0], %vec0[1], %vec0[1]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 0, i32 1, i32 1>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__1_1_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__1_1_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, v2
+; GFX900-NEXT: v_mov_b32_e32 v5, v3
+; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__1_1_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__1_1_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v2
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6,
v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  ; Result lanes: all three are %vec0[1].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 1, i32 1, i32 1>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__2_1_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_1_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v4
+; GFX900-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_1_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v5
+; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__2_1_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <3 x
i64> %vec0, <3 x i64> poison, <3 x i32> <i32 2, i32 1, i32 1>
+  ; Result lanes of the shuffle above: [%vec0[2], %vec0[1], %vec0[1]].
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__3_1_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__3_1_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v6, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__3_1_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__3_1_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v6, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  ; Index 3 addresses lane 0 of the second operand, which is poison here.
+  ; Result lanes: [poison, %vec0[1], %vec0[1]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> <i32 3, i32 1, i32 1>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__4_1_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__4_1_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+;
GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v10, 0
+; GFX900-NEXT: v_mov_b32_e32 v8, v2
+; GFX900-NEXT: v_mov_b32_e32 v9, v3
+; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__4_1_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v3
+; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__4_1_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v10, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v8, v2
+; GFX940-NEXT: v_mov_b32_e32 v9, v3
+; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  %vec1 = call <3 x i64> asm "; def $0", "=v"()
+  ; Mask indices 0-2 address %vec0 and 3-5 address %vec1.
+  ; Result lanes: [%vec1[1], %vec0[1], %vec0[1]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 4, i32 1, i32 1>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__5_1_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_1_1:
+; GFX900: ; %bb.0:
+;
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v10, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v0, v8
+; GFX900-NEXT: v_mov_b32_e32 v1, v9
+; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_1_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v9
+; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_1_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v10, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v0, v8
+; GFX940-NEXT: v_mov_b32_e32 v1, v9
+; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  %vec1 = call <3 x i64> asm "; def $0", "=v"()
+  ; Mask indices 0-2 address %vec0 and 3-5 address %vec1.
+  ; Result lanes: [%vec1[2], %vec0[1], %vec0[1]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 1, i32 1>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void
@v_shuffle_v3i64_v3i64__5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + ; Per the checks above, result lane 0 is v[8:9] (third 64-bit element of the second def, v[4:9]) and lane 2 is v[2:3] (second element of the first def); lane 1 is stored from v[2:3] unchanged. + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector 
<3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 poison, i32 1>
+  ; Result lanes of the shuffle above: [%vec1[2], poison, %vec0[1]]
+  ; ("u" in the test name marks the don't-care lane).
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__5_0_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v10, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16
+; GFX900-NEXT: v_mov_b32_e32 v2, v8
+; GFX900-NEXT: v_mov_b32_e32 v3, v9
+; GFX900-NEXT: v_mov_b32_e32 v4, v0
+; GFX900-NEXT: v_mov_b32_e32 v5, v1
+; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: v_mov_b32_e32 v2, v8
+; GFX90A-NEXT: v_mov_b32_e32 v3, v9
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_0_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v10, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v8
+; GFX940-NEXT:
v_mov_b32_e32 v3, v9
+; GFX940-NEXT: v_mov_b32_e32 v4, v0
+; GFX940-NEXT: v_mov_b32_e32 v5, v1
+; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  %vec1 = call <3 x i64> asm "; def $0", "=v"()
+  ; Mask indices 0-2 address %vec0 and 3-5 address %vec1.
+  ; Result lanes: [%vec1[2], %vec0[0], %vec0[1]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 0, i32 1>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__5_2_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_2_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v12, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[6:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16
+; GFX900-NEXT: v_mov_b32_e32 v2, v10
+; GFX900-NEXT: v_mov_b32_e32 v3, v11
+; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_2_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[6:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: v_mov_b32_e32 v2, v10
+; GFX90A-NEXT: v_mov_b32_e32 v3, v11
+; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_2_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v12, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[6:11]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v10
+; GFX940-NEXT: v_mov_b32_e32 v3, v11
+; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=v"()
+  %vec1 = call <3 x i64> asm "; def $0", "=v"()
+  ; Mask indices 0-2 address %vec0 and 3-5 address %vec1.
+  ; Result lanes: [%vec1[2], %vec0[2], %vec0[1]].
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> <i32 5, i32 2, i32 1>
+  store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v3i64_v3i64__5_3_1(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v10, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:5]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16
+; GFX900-NEXT: v_mov_b32_e32 v0, v8
+; GFX900-NEXT: v_mov_b32_e32 v1, v9
+; GFX900-NEXT: v_mov_b32_e32 v2, v4
+; GFX900-NEXT: v_mov_b32_e32 v3, v5
+; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:5]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16
+; GFX90A-NEXT: v_mov_b32_e32 v0, v8
+; GFX90A-NEXT: v_mov_b32_e32 v1, v9
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5
+; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+;
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> 
asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define 
void @v_shuffle_v3i64_v3i64__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i64_v3i64__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, 
ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; 
GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3i64_v3i64__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v3i64__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3i64_v3i64__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> 
poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v3i64__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 
sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; 
GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: 
global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3i64_v3i64__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 
+; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 
v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: 
global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = 
call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 
+; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_2_4: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 
v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__1_5_5: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, 
v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3i64_v3i64__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] 
offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v3i64__5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v3i64__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v3i64__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: 
v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v3i64__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v3i64_v3i64__u_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v3i64_v3i64__0_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__1_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__2_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> 
%vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__3_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__4_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__4_u_u: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm 
sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_0_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 
x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_1_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_2_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_3_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_4_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_5_u() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: 
s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_5_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_5_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_5_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_5_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_5_4() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 
s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", 
"=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__u_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__0_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__1_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__2_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__3_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__4_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_u_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_1_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, 
s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_2_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s12 +; GFX940-NEXT: s_mov_b32 s9, s13 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_3_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_4_0() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__u_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__u_1_1: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__0_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v3i64_v3i64__1_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__2_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, 
s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__3_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__4_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_u_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_0_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_2_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s12 +; GFX940-NEXT: s_mov_b32 s9, s13 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_3_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_4_1() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3i64_v3i64__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__u_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__0_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__1_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 
+; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__2_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__3_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__4_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_u_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v3i64_v3i64__5_0_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s12 +; GFX940-NEXT: s_mov_b32 s9, s13 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = 
shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_1_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_3_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_4_2() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s12 +; GFX940-NEXT: s_mov_b32 s9, s13 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__u_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__0_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> 
%shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__1_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__2_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__3_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__4_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_3_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_u_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_0_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_1_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_2_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_2_3: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_4_3() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__u_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v3i64_v3i64__0_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__1_4_4() { +; 
GFX900-LABEL: s_shuffle_v3i64_v3i64__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__2_4_4() { +; GFX900-LABEL: 
s_shuffle_v3i64_v3i64__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__3_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_4_4: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__4_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_u_4() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; 
GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_0_4() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: 
s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_1_4() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_2_4() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; 
GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_3_4() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call 
<3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__u_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__0_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__1_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, 
s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__2_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 
+; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__3_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3i64_v3i64__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__4_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_u_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = 
call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_0_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() 
+ %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_1_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, 
<3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_2_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s12 +; GFX940-NEXT: s_mov_b32 s9, s13 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm 
sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v3i64__5_3_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v3i64_v3i64__5_4_5() { +; GFX900-LABEL: s_shuffle_v3i64_v3i64__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v3i64__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v3i64__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll new file mode 100644 index 0000000000000..82c0f9361073b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll @@ -0,0 +1,16611 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3i64_v4i64__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v4i64__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> poison + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] 
offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 
v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v4i64__4_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def 
$0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 
x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> 
%shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; 
GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 
v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> zeroinitializer + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__1_0_0: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 
v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, 
v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] 
offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i64_v4i64__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr 
addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 
v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; 
GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] 
offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, 
v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v2 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i64_v4i64__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[2:3], 
s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret 
void +} + +define void @v_shuffle_v3i64_v4i64__7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", 
"=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i64_v4i64__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__2_2_2: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__4_2_2: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v12, v4 +; GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: 
global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v12 +; GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_2: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v12 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v12 +; GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: 
global_store_dwordx4 v14, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v12 +; GFX940-NEXT: v_mov_b32_e32 v9, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x 
i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__1_3_3(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i64_v4i64__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: 
global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v14, v6 +; GFX90A-NEXT: v_mov_b32_e32 v15, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v14, v6 +; GFX940-NEXT: v_mov_b32_e32 v15, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_3: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v16, 
v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_2_3(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 
x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, 
v14 +; GFX940-NEXT: v_mov_b32_e32 v9, v15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v14 +; GFX900-NEXT: v_mov_b32_e32 v11, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v14 +; GFX90A-NEXT: v_mov_b32_e32 v11, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v14 +; GFX940-NEXT: v_mov_b32_e32 v11, v15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v4i64__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: 
global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def 
$0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3i64_v4i64__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: 
global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v3i64_v4i64__7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} 
+ +define void @v_shuffle_v3i64_v4i64__7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__u_5_5: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 
+; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__3_5_5(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x 
i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; 
GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_5: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3i64_v4i64__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_5: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + 
ret void +} + +define void @v_shuffle_v3i64_v4i64__7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> 
%vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v3i64_v4i64__u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i64_v4i64__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> 
%shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; 
def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x 
i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; 
GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: 
global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_6: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void 
+} + +define void @v_shuffle_v3i64_v4i64__7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def 
$0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector 
<4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3i64_v4i64__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> 
%shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def 
$0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: 
global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> 
%vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v3i64_v4i64__7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 
v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, 
v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3i64_v4i64__7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3i64_v4i64__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3i64_v4i64__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3i64_v4i64__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + store <3 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v3i64_v4i64__u_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__0_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i64_v4i64__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__1_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v3i64_v4i64__2_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__3_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__6_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_u_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_0_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_1_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", 
"{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_2_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v3i64_v4i64__7_3_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_u() { +; GFX900-LABEL: 
s_shuffle_v3i64_v4i64__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_5_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_6_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_7_u() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_7_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_7_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_7_1: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_7_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3i64_v4i64__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_7_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_7_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_7_4: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_7_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_7_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + 
ret void +} + +define void @s_shuffle_v3i64_v4i64__7_7_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__u_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_0_0: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__0_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__1_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__2_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__3_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__6_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, 
s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_0_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_u_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_1_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_1_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_2_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_3_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: 
s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_5_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 
+; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_6_0() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__u_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__0_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x 
i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__1_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__2_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__3_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; 
GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__6_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_1_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_u_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_0_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: 
s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_2_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_3_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_5_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_5_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_6_1() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__u_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 
+; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__0_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__1_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__2_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_2_2: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__3_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector 
<4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm 
sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__6_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret 
void +} + +define void @s_shuffle_v3i64_v4i64__7_2_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_u_2() { +; 
GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_0_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_1_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_3_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 
s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: 
s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_5_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_6_2() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__u_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__0_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__1_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__2_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: 
s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__3_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + 
ret void +} + +define void @s_shuffle_v3i64_v4i64__6_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_3_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_u_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_0_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: 
s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_1_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_2_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 
s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_5_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_6_3() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__u_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__0_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__1_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__2_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__3_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_4_4: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 
= call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__6_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_4: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_u_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_0_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: 
s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_1_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_2_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_3_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: 
s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_5_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_6_4() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: 
s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__u_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3i64_v4i64__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__0_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: 
s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__1_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: 
s_mov_b32 s13, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__2_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__3_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", 
"{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__6_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_5_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_u_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_0_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_1_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_2_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_3_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: 
s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_6_5() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__u_6_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__0_6_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3i64_v4i64__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__1_6_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i64_v4i64__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__2_6_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3i64_v4i64__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__3_6_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__3_6_6: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_6_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; 
def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_6_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__6_6_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_6_6: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_6_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_u_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_0_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_1_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_2_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_3_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 
s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_5_6() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + 
+define void @s_shuffle_v3i64_v4i64__u_7_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__0_7_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__1_7_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__2_7_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__3_7_7() { +; 
GFX900-LABEL: s_shuffle_v3i64_v4i64__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> 
%vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__4_7_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__5_7_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__6_7_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, 
s9 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_u_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_0_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_0_7: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_1_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_2_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_3_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; 
GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_4_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; 
GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_5_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, 
s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} + +define void @s_shuffle_v3i64_v4i64__7_6_7() { +; GFX900-LABEL: s_shuffle_v3i64_v4i64__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3i64_v4i64__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3i64_v4i64__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x i64> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll new file mode 100644 index 0000000000000..be353407c0dd5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll @@ -0,0 +1,4508 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3p0_v2p0__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v2p0__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> poison + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3p0_v2p0__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x 
ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v2p0__2_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: 
v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} 
+ +define void @v_shuffle_v3p0_v2p0__3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3p0_v2p0__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; 
GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> zeroinitializer + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3p0_v2p0__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; 
GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 
sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 
+; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3p0_v2p0__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v2p0__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v2p0__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: 
global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: 
global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 
v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v2p0__3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v2p0__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v2p0__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 
v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v2p0__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v3p0_v2p0__u_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__0_u_u() { +; 
GFX900-LABEL: s_shuffle_v3p0_v2p0__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__1_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__2_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_0_u() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; 
GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_1_u() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_2_u() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 
s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_3_u() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_3_0() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_3_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_3_1() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 
s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_3_2() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: 
s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__u_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__0_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 
x ptr> %vec0, <2 x ptr> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__1_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__2_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3p0_v2p0__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_u_0() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_1_0() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; 
GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_2_0() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__u_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: 
s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__0_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__1_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__2_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_1_1: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_u_1() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_0_1() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_0_1: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_2_1() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__u_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__0_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__0_2_2: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__1_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3p0_v2p0__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__2_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_u_2() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_0_2() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3p0_v2p0__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_1_2() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__u_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; 
use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__0_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void 
@s_shuffle_v3p0_v2p0__1_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__2_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__2_3_3: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_u_3() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_0_3() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_1_3() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v2p0__3_2_3() { +; GFX900-LABEL: s_shuffle_v3p0_v2p0__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v2p0__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: 
s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v2p0__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll new file mode 100644 index 0000000000000..efb53f78016d9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll @@ -0,0 +1,9583 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3p0_v3p0__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v3p0__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> poison + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; 
GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def 
$0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v3p0__3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 
x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; 
GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_u: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3p0_v3p0__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; 
GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3p0_v3p0__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, 
v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = 
shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> zeroinitializer + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v3p0_v3p0__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, 
align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v3p0_v3p0__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, 
v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3p0_v3p0__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr 
addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__3_1_1: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; 
GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", 
"=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, 
v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3p0_v3p0__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__4_2_2: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: 
v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr 
addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call 
<3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 
sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v3p0__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 
x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v3p0__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 
x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_3_3(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr 
addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3p0_v3p0__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; 
GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 
+; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + 
ret void +} + +define void @v_shuffle_v3p0_v3p0__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = 
shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: 
v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr 
addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3p0_v3p0__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v3p0_v3p0__5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x 
ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, 
v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v3p0_v3p0__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, 
<3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: 
global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_u_5: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + 
store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", 
"=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, 
v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 
v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v3p0__5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v3p0__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v3p0__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v3p0__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v3p0_v3p0__u_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__0_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p0_v3p0__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__1_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__2_u_u() 
{ +; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__3_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__4_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_0_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_1_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_2_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 
+; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_3_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_4_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_5_u() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_u: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_5_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: 
s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_5_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_5_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_2: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_5_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_5_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, 
s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__u_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__0_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__1_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; 
GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__2_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) 
+ ret void +} + +define void @s_shuffle_v3p0_v3p0__3_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__4_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, 
s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void 
@s_shuffle_v3p0_v3p0__5_u_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_1_0() { +; 
GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x 
ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_2_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s12 +; GFX940-NEXT: s_mov_b32 s9, s13 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_3_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + 
%vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_4_0() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() 
+ %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__u_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__0_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__1_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p0_v3p0__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__2_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__3_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__4_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_u_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; 
GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_0_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: 
s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_2_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s12 +; GFX940-NEXT: s_mov_b32 s9, s13 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_3_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_4_1() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__u_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__0_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__1_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__2_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__3_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__4_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_u_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 
s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_0_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: 
s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s12 +; GFX940-NEXT: s_mov_b32 s9, s13 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_1_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_3_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_4_2() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s12 +; GFX940-NEXT: s_mov_b32 s9, s13 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__u_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__0_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__1_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__2_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__3_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__4_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_u_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_0_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_1_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_2_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_4_3() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__u_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__0_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__1_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__2_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__3_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, 
<3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__4_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_u_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_4: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_0_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_1_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, 
s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_2_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; 
GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_3_4() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__u_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__0_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__1_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() 
+ %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__2_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf 
= shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__3_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__4_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_u_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_0_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_1_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_2_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s16 +; GFX900-NEXT: s_mov_b32 s13, s17 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s16 +; GFX90A-NEXT: s_mov_b32 s13, s17 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s12 +; GFX940-NEXT: s_mov_b32 s9, s13 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_3_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; 
GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v3p0__5_4_5() { +; GFX900-LABEL: s_shuffle_v3p0_v3p0__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v3p0__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v3p0__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} +;; NOTE: These prefixes are 
unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll new file mode 100644 index 0000000000000..324c617c2e7c4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll @@ -0,0 +1,16611 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3p0_v4p0__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v4p0__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> poison + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 
v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, 
v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v4p0__4_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x 
ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> 
asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def 
$0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x 
ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v3p0_v4p0__7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; 
GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; 
GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3p0_v4p0__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 
v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; 
GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: 
global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__0_0_0: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> zeroinitializer + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], 
s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, 
v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] offset:16 +; 
GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: 
global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 
v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} 
+ +define void @v_shuffle_v3p0_v4p0__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v2 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; 
GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3p0_v4p0__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v14, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3p0_v4p0__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__u_2_2: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v3p0_v4p0__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v12, v4 +; 
GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v12 +; GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: 
global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v12 +; GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v12 +; GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v12 +; GFX940-NEXT: v_mov_b32_e32 v9, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__u_3_3(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; 
GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__1_3_3: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] 
offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 
v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, v6 +; GFX90A-NEXT: v_mov_b32_e32 v15, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], 
s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v14, v6 +; GFX940-NEXT: v_mov_b32_e32 v15, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: 
global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] 
offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v14 +; GFX940-NEXT: v_mov_b32_e32 v9, v15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_3: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v14 +; GFX900-NEXT: v_mov_b32_e32 v11, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v14 +; GFX90A-NEXT: v_mov_b32_e32 v11, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v16, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v14 +; GFX940-NEXT: v_mov_b32_e32 v11, v15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, 
align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v4p0__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) 
%ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p0_v4p0__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__5_4_4: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_0_4: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[4:5], s[16:17] 
offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 
v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__u_5_5: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: 
global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; 
GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: 
global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v10, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_1_5: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, 
v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 5> ; lanes: vec1[3], vec0[3], vec1[1] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2
v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 4, i32 5> ; lanes: vec1[3], vec1[0], vec1[1] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT:
global_store_dwordx2 v8, v[2:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 5> ; lanes: vec1[3], vec1[2], vec1[1] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL:
v_shuffle_v3p0_v4p0__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 poison, i32 6, i32 6> ; lanes: poison, vec1[2], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__0_6_6: +; GFX940:
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 0, i32 6, i32 6> ; lanes: vec0[0], vec1[2], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +;
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 1, i32 6, i32 6> ; lanes: vec0[1], vec1[2], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11
+; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 2, i32 6, i32 6> ; lanes: vec0[2], vec1[2], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT:
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 3, i32 6, i32 6> ; lanes: vec0[3], vec1[2], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +;
GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 4, i32 6, i32 6> ; lanes: vec1[0], vec1[2], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT:
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 5, i32 6, i32 6> ; lanes: vec1[1], vec1[2], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +;
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 6, i32 6, i32 6> ; lanes: vec1[2], vec1[2], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +;
GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 6, i32 6> ; lanes: vec1[3], vec1[2], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 poison, i32 6> ; lanes: vec1[3], poison, vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 0, i32 6> ; lanes: vec1[3], vec0[0], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_1_6: +; GFX940: ; %bb.0: +;
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 1, i32 6> ; lanes: vec1[3], vec0[1], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +;
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 2, i32 6> ; lanes: vec1[3], vec0[2], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5,
v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[12:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> <i32 7, i32 3, i32 6> ; lanes: vec1[3], vec0[3], vec1[2] + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2
v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[4:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[4:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3p0_v4p0__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__0_7_7: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 
v7, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: 
global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, 
v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v10, v[8:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx2 v12, v[10:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx2 v14, v[12:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v14, v[12:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx2 v16, v[14:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v16, v[14:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v3p0_v4p0__7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p0_v4p0__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p0_v4p0__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx2 v8, v[6:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p0_v4p0__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx2 v8, v[6:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + store <3 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v3p0_v4p0__u_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) 
+ ret void +} + +define void @s_shuffle_v3p0_v4p0__0_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__1_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__2_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__3_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret 
void +} + +define void @s_shuffle_v3p0_v4p0__6_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_u_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_0_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_1_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_1_u: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_2_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_3_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_5_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_6_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 
+; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_7_u() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_7_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_7_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_7_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_7_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_7_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_7_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: 
s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_7_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_7_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 
s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__u_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", 
"{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__0_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__1_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, 
s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__2_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__3_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: 
s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p0_v4p0__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__5_0_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__6_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_0_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; 
GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_u_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_1_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_2_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_3_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_5_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_6_0() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: 
s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__u_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__0_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__1_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__2_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__3_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p0_v4p0__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__6_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__6_1_1: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_1_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_u_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_0_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_0_1: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_2_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_3_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 
s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: 
s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_5_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; 
GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_6_1() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__u_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__0_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm 
sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__1_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__2_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: 
s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__3_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__6_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_2_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x 
ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_u_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_0_2() { +; GFX900-LABEL: 
s_shuffle_v3p0_v4p0__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x 
i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_1_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_3_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_5_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_6_2() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__u_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__0_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; 
GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__1_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3p0_v4p0__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__2_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__3_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 
+; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> 
%shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__6_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_3_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; 
GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_u_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_0_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: 
s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_1_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; 
GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_2_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_5_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_6_3() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p0_v4p0__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__u_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__0_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__1_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__2_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__3_4_4() { +; GFX900-LABEL: 
s_shuffle_v3p0_v4p0__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> 
asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__6_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_4: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_u_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_0_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: 
s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_1_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_2_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_3_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: 
s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_5_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, 
s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_6_4() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; 
GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__u_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__0_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__1_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__2_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__3_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p0_v4p0__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__6_5_5() { +; GFX900-LABEL: 
s_shuffle_v3p0_v4p0__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_5_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_u_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_0_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; 
GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_1_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_2_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_3_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3p0_v4p0__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_6_5() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__u_6_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__0_6_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__1_6_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__2_6_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__3_6_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_6_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_6_6() { +; 
GFX900-LABEL: s_shuffle_v3p0_v4p0__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__6_6_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 
s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_6_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_u_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_0_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: 
s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_1_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 
s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_2_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_3_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> 
asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_5_6() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__u_7_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, 
s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__0_7_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__0_7_7: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__1_7_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3p0_v4p0__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__2_7_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__3_7_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; 
GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__4_7_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__5_7_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3p0_v4p0__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__6_7_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_u_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; 
GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_0_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: 
s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_1_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: 
s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_2_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: 
s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_3_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_4_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_5_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 
s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p0_v4p0__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v3p0_v4p0__7_6_7() { +; GFX900-LABEL: s_shuffle_v3p0_v4p0__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p0_v4p0__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p0_v4p0__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:15]}"(<3 x ptr> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll new file mode 100644 index 0000000000000..f7d904ff71ef9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v2p3.ll @@ -0,0 +1,4166 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3p3_v2p3__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v2p3__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> poison + store <3 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3p3_v2p3__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v2p3__2_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: 
global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm 
"; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr 
addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, 
ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: 
global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> zeroinitializer + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v0, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> 
%vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> 
%vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + 
+define void @v_shuffle_v3p3_v2p3__3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3p3_v2p3__3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3p3_v2p3__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 
v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: 
global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x 
ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v2p3__u_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 
x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v2, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v2, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v2p3__2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3p3_v2p3__3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v1, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v1, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3p3_v2p3__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: 
global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; 
GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, 
v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v2p3__3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v2p3__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v2p3__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v2p3__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; 
def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v3p3_v2p3__u_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__0_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__0_u_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__1_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__2_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__2_u_u: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_0_u() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr 
addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_1_u() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_2_u() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_3_u() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_3_0() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_3_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_3_2() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__u_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__0_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__1_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__2_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_u_0() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_1_0() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_2_0() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__u_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__0_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__1_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p3_v2p3__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__2_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void 
+} + +define void @s_shuffle_v3p3_v2p3__3_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_u_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_0_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 
s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_2_1() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__u_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__0_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__1_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__2_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <3 x i32> + call 
void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_u_2() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_0_2() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: 
s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_1_2() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__u_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call 
void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__0_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__1_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__1_3_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__2_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_u_3() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_u_3: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_0_3() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_1_3() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v2p3__3_2_3() { +; GFX900-LABEL: s_shuffle_v3p3_v2p3__3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v2p3__3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v2p3__3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is 
autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll new file mode 100644 index 0000000000000..9f71130d6ca0c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v3p3.ll @@ -0,0 +1,8883 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3p3_v3p3__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v3p3__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> poison + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 
x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v3p3__3_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_2_u: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: 
global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__u_0_0(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: 
global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> zeroinitializer + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; 
GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_3_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_1_1: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_0_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, 
align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_0_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v3p3__u_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__0_3_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr 
addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v3p3__3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> 
%vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_3_3(ptr addrspace(1) inreg %ptr) { 
+; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v3, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v3, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_1_3: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v3p3_v3p3__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> 
asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3p3_v3p3__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: 
global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3p3_v3p3__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3p3_v3p3__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3p3_v3p3__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 
x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v5, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v3p3__5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v3p3__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v3p3__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v3p3__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v3p3_v3p3__u_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm 
"; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__0_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__1_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__1_u_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__2_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__3_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__4_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_0_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define 
void @s_shuffle_v3p3_v3p3__5_1_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_2_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_3_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_4_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_5_u() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + 
+define void @s_shuffle_v3p3_v3p3__5_5_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_5_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_5_1: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_5_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_5_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_5_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 
s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__u_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__0_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__0_0_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__1_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__2_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__3_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) 
+ ret void +} + +define void @s_shuffle_v3p3_v3p3__4_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_0_0: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_u_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_1_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s5 +; 
GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_2_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_3_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_3_0: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_4_0() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__u_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__0_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__1_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__2_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; 
GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__3_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = 
shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__4_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v3p3_v3p3__5_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_u_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_0_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_2_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_3_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_4_1() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__u_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__0_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__1_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__2_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, 
s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__3_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> 
%vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__4_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_2_2() { +; GFX900-LABEL: 
s_shuffle_v3p3_v3p3__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_u_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_0_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3p3_v3p3__5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_1_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_3_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_4_2() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__u_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__0_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3p3_v3p3__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__1_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__2_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__3_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3p3_v3p3__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__4_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + 
+define void @s_shuffle_v3p3_v3p3__5_u_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_0_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_1_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v3p3_v3p3__5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_2_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_4_3() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__u_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> 
asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__0_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} 
+ +define void @s_shuffle_v3p3_v3p3__1_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__2_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__3_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: s_shuffle_v3p3_v3p3__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__4_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x 
i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_u_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_0_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_1_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_2_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_3_4() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__u_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__0_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call 
void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__1_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__2_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__2_5_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__3_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__4_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__4_5_5: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_u_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + 
call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_0_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_1_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_1_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_2_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_3_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_3_5: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v3p3__5_4_5() { +; GFX900-LABEL: s_shuffle_v3p3_v3p3__5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v3p3__5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v3p3__5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll new file mode 100644 index 0000000000000..ca204e46b4264 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v3p3.v4p3.ll @@ -0,0 +1,15324 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v3p3_v4p3__u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v4p3__u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> poison + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; 
GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v4, 
v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v4p3__4_u_u: 
+; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__6_u_u(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v3p3_v4p3__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: 
global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3p3_v4p3__7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_7_2(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_7_3(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3p3_v4p3__7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; 
GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v3p3_v4p3__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v4, 
v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> zeroinitializer + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> 
+ store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_0_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: 
global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: 
v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3p3_v4p3__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: 
global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v8, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v8, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v3p3_v4p3__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_2_2: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v7, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, 
<3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v8, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v4p3__u_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx3 
v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v3p3_v4p3__4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr 
addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_4: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: 
global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> 
asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3p3_v4p3__6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_5: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_3_5: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_5: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx3 v5, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__u_6_6: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> 
asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v3p3_v4p3__6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx3 v6, 
v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx3 v8, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: global_store_dwordx3 v6, v[3:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define 
void @v_shuffle_v3p3_v4p3__0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx3 v5, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v3p3_v4p3__1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) 
%ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x 
ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr 
addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: 
global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx3 v5, v[2:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[2:4], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[2:4], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx3 v6, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx3 v7, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx3 v8, v[5:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx3 v8, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx3 v8, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx3 v7, v[4:6], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx3 v7, v[4:6], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v3p3_v4p3__7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v3p3_v4p3__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx3 v4, v[1:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v3p3_v4p3__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx3 v4, v[0:2], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v3p3_v4p3__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + store <3 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v3p3_v4p3__u_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> poison + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__0_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__0_u_u: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__1_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__2_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__3_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__4_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr 
addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__5_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__6_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_u_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_0_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, 
<4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_1_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_2_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_3_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; 
GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_4_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_5_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_6_u() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_7_u() { +; GFX900-LABEL: 
s_shuffle_v3p3_v4p3__7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_7_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_7_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_7_1: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_7_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_7_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 
s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_7_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_7_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_7_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_7_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__u_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__0_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__1_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v3p3_v4p3__2_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__3_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__4_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p3_v4p3__4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__5_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__6_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_0_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_u_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 
x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_1_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + 
+define void @s_shuffle_v3p3_v4p3__7_2_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_3_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_3_0: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_4_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_5_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_6_0() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__u_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__0_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> 
%vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__1_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__2_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__3_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__3_1_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__4_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__5_1_1() { +; 
GFX900-LABEL: s_shuffle_v3p3_v4p3__5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__6_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_1_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_u_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_0_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; 
GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_2_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_3_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p3_v4p3__7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_4_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_5_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_6_1() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__u_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__0_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__1_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3p3_v4p3__1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__2_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> 
poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__3_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__4_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3p3_v4p3__4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__5_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p3_v4p3__5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__6_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_2_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_u_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", 
"{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_0_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_1_2() { +; 
GFX900-LABEL: s_shuffle_v3p3_v4p3__7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_3_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_4_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_4_2: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_5_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_6_2() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__u_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__0_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__1_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__2_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__3_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__4_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v3p3_v4p3__5_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__6_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_3_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_u_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_0_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_1_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_2_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_4_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_5_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_6_3() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__u_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__0_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: s_shuffle_v3p3_v4p3__0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__1_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__2_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__3_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__4_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <3 x i32> + 
call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__5_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__6_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_4_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_u_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_0_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: 
s_mov_b32 s10, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_1_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = 
call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_2_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x 
ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_3_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> 
%shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_5_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_6_4() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: 
s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__u_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__0_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__1_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__2_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> 
%vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__3_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__4_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__4_5_5: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__5_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__6_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_5_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v3p3_v4p3__7_u_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_0_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_1_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_1_5: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_2_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_3_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; 
GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_4_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_6_5() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__u_6_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__0_6_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__1_6_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__2_6_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__3_6_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__4_6_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__5_6_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__6_6_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__6_6_6: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_6_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_u_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_0_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_1_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define 
void @s_shuffle_v3p3_v4p3__7_2_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_3_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_4_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_5_6() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v3p3_v4p3__7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__u_7_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + 
%shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__0_7_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v3p3_v4p3__1_7_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__2_7_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__2_7_7: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__3_7_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__4_7_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__5_7_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__6_7_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: 
s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_u_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> 
%shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_0_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_1_7() { +; GFX900-LABEL: 
s_shuffle_v3p3_v4p3__7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_2_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_3_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; 
GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_4_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_5_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v3p3_v4p3__7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v3p3_v4p3__7_6_7() { +; GFX900-LABEL: s_shuffle_v3p3_v4p3__7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v3p3_v4p3__7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v3p3_v4p3__7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <3 x i32> + call void asm sideeffect "; use $0", "{s[10:12]}"(<3 x ptr addrspace(3)> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v2bf16.ll new file mode 100644 index 0000000000000..a9e8852f04779 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v2bf16.ll @@ -0,0 +1,6535 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4bf16_v2bf16__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v2bf16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> poison + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__1_u_u_u: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v2bf16__2_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_u_u_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, 
s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 
0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 
v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_3: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> zeroinitializer + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, 
v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4bf16_v2bf16__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; 
def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 
x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, 
v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 
0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: 
v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v2bf16__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v4bf16_v2bf16__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v2bf16__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = 
shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 
+; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; 
GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> 
asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: 
v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; 
GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; 
GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x 
bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + 
store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x 
bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> 
%vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v3 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v4bf16_v2bf16__3_3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> 
%vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v2bf16__3_3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v2bf16__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=v"() + %vec1 = call <2 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__u_u_u_u() { +; GFX9-LABEL: s_shuffle_v4bf16_v2bf16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__2_u_u_u() { +; GFX9-LABEL: s_shuffle_v4bf16_v2bf16__2_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_0_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4bf16_v2bf16__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_1_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_2_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + 
%vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_0_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_1_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_2_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4bf16_v2bf16__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_3_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4bf16_v2bf16__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_3_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_3_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_3_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} 
+ +define void @s_shuffle_v4bf16_v2bf16__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4bf16_v2bf16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x 
bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x 
bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_u_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_1_0_0() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v2bf16__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v4bf16_v2bf16__3_2_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_0_0() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v2bf16__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_u_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_1_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_2_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 
16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; 
GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_1_1_1() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v2bf16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v4bf16_v2bf16__3_u_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_0_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_0_1_1: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_2_1_1() { +; 
GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define 
void @s_shuffle_v4bf16_v2bf16__3_3_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_u_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_0_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_2_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__u_2_2_2() { +; GFX9-LABEL: s_shuffle_v4bf16_v2bf16__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 
x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__2_2_2_2() { +; GFX9-LABEL: s_shuffle_v4bf16_v2bf16__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_u_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_u_2_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_0_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_1_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_u_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v4bf16_v2bf16__3_3_0_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_1_2() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v2bf16__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v4bf16_v2bf16__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_u_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_0_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_1_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_2_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_u_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_0_3() { +; 
GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_1_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_3: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v2bf16__3_3_2_3() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v2bf16__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v2bf16__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x bfloat> asm "; def $0", "=s"() + %vec1 = call <2 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <2 x bfloat> %vec0, <2 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll new file mode 100644 index 0000000000000..a423ef40d8f76 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll @@ -0,0 +1,14253 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4bf16_v3bf16__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> poison + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; 
GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} 
+ +define void @v_shuffle_v4bf16_v3bf16__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__3_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> 
%vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4bf16_v3bf16__5_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x 
bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> 
poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> 
asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> 
+ %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x 
bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_0_u(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = 
shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: 
global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: 
global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 
+; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4bf16_v3bf16__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v3, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v3, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> zeroinitializer + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, 
v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_0_0_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, 
s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 
v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4bf16_v3bf16__5_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> 
+ %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4bf16_v3bf16__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + 
store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; 
GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_1_1_1: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_0_1_1(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = 
shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4bf16_v3bf16__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 
v2, s4, v3, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, 
v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4bf16_v3bf16__5_5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = 
shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 
0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; 
GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_2_2_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, 
v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v4bf16_v3bf16__5_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> 
poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; GFX940-NEXT: s_mov_b32 s2, 
0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 
v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4bf16_v3bf16__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x 
i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + 
store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_3: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; 
GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 
+; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> 
poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = 
shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def 
$0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> 
%vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; 
GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_4_4: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_4_4: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, 
v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_4: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = 
shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: 
v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: 
v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; 
GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 
0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_5_5_5: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4bf16_v3bf16__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_2_5_5(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 
x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 
x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x 
bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> 
poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 
v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, 
v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v3bf16__5_5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: 
global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__u_u_u_u() { +; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__3_u_u_u() { +; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__3_u_u_u: +; GFX9: ; %bb.0: 
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> 
%shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_0_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_1_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4bf16_v3bf16__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_2_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_3_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_4_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf 
= shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_0_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x 
i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_1_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_2_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_3_u() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v3bf16__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_4_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_5_u() { +; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x 
i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_5_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = 
shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_5_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> 
poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_5_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + 
%2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_5_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v4bf16_v3bf16__5_5_5_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 
+; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_u_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_1_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_2_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_3_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_4_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_u_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_1_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_2_0() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v3bf16__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_3_0() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v3bf16__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_4_0() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v3bf16__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x 
bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_1_1_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, 
s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_u_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_0_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_2_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_3_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4bf16_v3bf16__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_4_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_u_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_0_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_2_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_3_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_4_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4bf16_v3bf16__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 
+; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void 
asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_u_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_0_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_1_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_3_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_4_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_u_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_0_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_1_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_3_2() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v3bf16__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_4_2() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v3bf16__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x 
bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__u_3_3_3() { +; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v4bf16_v3bf16__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__3_3_3_3() { +; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4bf16_v3bf16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_u_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_0_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_1_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_2_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_4_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_u_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x 
i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_0_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void 
asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_1_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> 
poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_2_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + 
%shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_4_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + 
+define void @s_shuffle_v4bf16_v3bf16__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x 
bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x 
bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 
+; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_u_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_0_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_4_4: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_1_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_2_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_3_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_u_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_0_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_1_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_2_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_3_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_5_5_5: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_u_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 
= shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_0_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 
x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_1_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm 
"; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_2_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() 
+ %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_3_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + 
call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_4_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_u_5() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v3bf16__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_0_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_1_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_2_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_3_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v3bf16__5_5_4_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4bf16_v3bf16__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %1 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> + %2 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> + %shuf = shufflevector <3 x bfloat> %1, <3 x bfloat> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll new file mode 100644 index 0000000000000..7cae3e9215c2a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v4bf16.ll @@ -0,0 +1,24202 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4bf16_v4bf16__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v4bf16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> poison + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v4bf16__4_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__6_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; 
GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_5_u_u: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, 
v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 
0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4bf16_v4bf16__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4bf16_v4bf16__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: 
global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_alignbit_b32 v3, v0, v1, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v0, v1, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_6: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, 
v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x 
bfloat> %vec0, <4 x bfloat> poison, <4 x i32> zeroinitializer + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__2_0_0_0(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; 
GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__6_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: 
v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> 
asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; 
GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_0: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4bf16_v4bf16__7_7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + 
%vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__0_1_1_1: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__2_1_1_1: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 
0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4bf16_v4bf16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__6_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_0_1_1: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; 
GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_1_1(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x 
bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x 
bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 
v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_1: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 
+; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: 
global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__0_2_2_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, 
<4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4bf16_v4bf16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, 
ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__6_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def 
$0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 
16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 
16 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_1_2(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 
v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 
0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: 
v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", 
"=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__3_3_3_3(ptr addrspace(1) inreg %ptr) { 
+; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; 
GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__6_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_u_3_3(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: 
global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_3: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, 
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_3: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v3, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: 
global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v4bf16__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4bf16_v4bf16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__2_4_4_4: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector 
<4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4bf16_v4bf16__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: 
v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__6_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; 
def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> 
asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; 
GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v2, v2, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; 
GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 
0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: 
global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4bf16_v4bf16__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: 
v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 
v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__6_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_5_5_5: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 
0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4bf16_v4bf16__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v2, v2, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: 
global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4bf16_v4bf16__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 
0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_5: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, 
v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v3, 
s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v3, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__u_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: 
v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__0_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__1_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__2_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__3_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4bf16_v4bf16__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v3, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v3, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__4_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__5_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__6_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, 
v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4bf16_v4bf16__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_3_6_6: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v2, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4bf16_v4bf16__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v3, v3, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v3, v3, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; 
GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 
0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__u_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__0_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 
+; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__1_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__2_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4bf16_v4bf16__3_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 
x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__4_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__5_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__6_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4bf16_v4bf16__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_2_7_7: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4bf16_v4bf16__7_3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 
x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4bf16_v4bf16__7_5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; 
GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_3_7(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 
x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v2, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v3, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v3, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + 
ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4bf16_v4bf16__7_7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4bf16_v4bf16__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4bf16_v4bf16__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=v"() + %vec1 = call <4 x bfloat> asm "; def $0", "=v"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__u_u_u_u() { +; GFX9-LABEL: s_shuffle_v4bf16_v4bf16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> 
%vec0, <4 x bfloat> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> 
asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__4_u_u_u() { +; GFX9-LABEL: s_shuffle_v4bf16_v4bf16__4_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__6_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__6_u_u_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_u_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_0_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x 
bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 0, i32 poison, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_1_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> 
%vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 1, i32 poison, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_2_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 2, i32 poison, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_3_u_u() { +; 
GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 3, i32 poison, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_4_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_4_u_u: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 4, i32 poison, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_5_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4bf16_v4bf16__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 5, i32 poison, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_6_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 6, i32 poison, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_u_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_0_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def 
$0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_1_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + 
%shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_2_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v4bf16_v4bf16__7_7_3_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_4_u() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v4bf16__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_5_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_6_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_7_u() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 7, i32 7, i32 poison> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_7_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 7, i32 7, i32 0> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_7_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 
s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> <i32 7, i32 7, i32 7, i32 1> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_7_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_7_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_7_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 
x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_7_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_7_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__u_0_0_0: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__6_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__6_0_0_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_0_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_u_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_1_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_2_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_3_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_4_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_5_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_6_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_0_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_u_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_1_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_2_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_3_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 
s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_4_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_5_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_6_0() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector 
<4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 
s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__6_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_1_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_u_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_0_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_2_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_3_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_4_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_5_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_6_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, 
s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_1_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; 
GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_u_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_0_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_2_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, 
s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_3_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_4_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_5_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_6_1() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4bf16_v4bf16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__0_2_2_2: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__6_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4bf16_v4bf16__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_2_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_u_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_0_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_1_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_3_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_4_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_5_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_6_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_2_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_u_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_0_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_1_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_3_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_4_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_5_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_6_2() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__1_3_3_3() { +; GFX900-LABEL: 
s_shuffle_v4bf16_v4bf16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 
16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__4_3_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__6_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_3_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_u_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_0_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_1_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_2_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_4_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX90A-NEXT: 
s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_5_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_6_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_3_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_3: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_u_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_0_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_1_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4bf16_v4bf16__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_2_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_4_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4bf16_v4bf16__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_5_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_6_3() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__u_4_4_4() { +; GFX9-LABEL: s_shuffle_v4bf16_v4bf16__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> 
asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__4_4_4_4() { +; GFX9-LABEL: s_shuffle_v4bf16_v4bf16__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
bfloat> %vec0, <4 x bfloat> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__6_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_4_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4bf16_v4bf16__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_u_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_0_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_1_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_1_4_4: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_2_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_2_4_4: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_3_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4bf16_v4bf16__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_5_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_6_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_4_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_u_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_0_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_1_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_2_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_3_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_5_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_6_4() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 
+; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, 
s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__6_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void 
@s_shuffle_v4bf16_v4bf16__7_5_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_u_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_0_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_1_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 
s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_2_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_3_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_4_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_6_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_6_5_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_5_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_u_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_0_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_1_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_5: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_2_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_3_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; 
GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_4_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_6_5() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_5: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__u_6_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__0_6_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__1_6_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__2_6_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 
s10, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__3_6_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__4_6_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret 
void +} + +define void @s_shuffle_v4bf16_v4bf16__5_6_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__6_6_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_6_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_u_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_0_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_1_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_2_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_3_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_4_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_5_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; 
def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_6_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_u_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_6: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_0_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 
+; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_1_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_2_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s11, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_3_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; 
GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_4_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_5_6() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__u_7_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, 
s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__0_7_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4bf16_v4bf16__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__1_7_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__2_7_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__3_7_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__4_7_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__5_7_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__6_7_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_u_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_0_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_1_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_2_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_3_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_4_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_5_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_6_7_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_u_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_0_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_1_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_2_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_3_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_4_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_5_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 
= call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} + +define void @s_shuffle_v4bf16_v4bf16__7_7_6_7() { +; GFX900-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4bf16_v4bf16__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x bfloat> asm "; def $0", "=s"() + %vec1 = call <4 x bfloat> asm "; def $0", "=s"() + %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x bfloat> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v2f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v2f16.ll new file mode 100644 index 0000000000000..9e400b1fa1c7a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v2f16.ll @@ -0,0 +1,6535 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4f16_v2f16__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v2f16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> poison + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x 
half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v2f16__2_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 
x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4f16_v2f16__3_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_2_u_u(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f16_v2f16__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f16_v2f16__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> 
zeroinitializer + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__2_0_0_0: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_2_0_0(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v4f16_v2f16__3_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x 
half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, 
v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = 
shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4f16_v2f16__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v2f16__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__0_2_2_2(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v2f16__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 
v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 
v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; 
GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 
v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_u_3_3: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_0_3_3: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_2_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, 
v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v3 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v2f16__3_3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v2f16__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v2f16__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v2f16__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, 
v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=v"() + %vec1 = call <2 x half> asm "; def $0", "=v"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v4f16_v2f16__u_u_u_u() { +; GFX9-LABEL: s_shuffle_v4f16_v2f16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> 
poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__2_u_u_u() { +; GFX9-LABEL: s_shuffle_v4f16_v2f16__2_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_u_u_u() { +; 
GFX900-LABEL: s_shuffle_v4f16_v2f16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_0_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_1_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_2_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x 
half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_0_u() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_1_u() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_1_u: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_2_u() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_3_u() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_3_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_3_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_3_2() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf 
= shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = 
call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_u_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void 
@s_shuffle_v4f16_v2f16__3_1_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) 
+ ret void +} + +define void @s_shuffle_v4f16_v2f16__3_2_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_0_0() { +; GFX900-LABEL: 
s_shuffle_v4f16_v2f16__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_u_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_1_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_2_0() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 
16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_u_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_0_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_2_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_u_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_0_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_2_1() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__u_2_2_2() { +; GFX9-LABEL: s_shuffle_v4f16_v2f16__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__0_2_2_2() { +; GFX900-LABEL: 
s_shuffle_v4f16_v2f16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4f16_v2f16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__2_2_2_2() { +; GFX9-LABEL: s_shuffle_v4f16_v2f16__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_u_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_0_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + 
%vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_1_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x 
half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_u_2() { +; GFX900-LABEL: 
s_shuffle_v4f16_v2f16__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_0_2() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, 
s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_1_2() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, 
s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: s_shuffle_v4f16_v2f16__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__2_3_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_u_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_0_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_1_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> 
%vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_2_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_u_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_0_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_1_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v2f16__3_3_2_3() { +; GFX900-LABEL: s_shuffle_v4f16_v2f16__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v2f16__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v2f16__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x half> asm "; def $0", "=s"() + %vec1 = call <2 x half> asm "; def $0", "=s"() + %shuf = shufflevector <2 x half> %vec0, <2 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll new file mode 100644 index 0000000000000..2105305e71b17 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v3f16.ll @@ -0,0 +1,14253 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4f16_v3f16__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v3f16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = 
shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> poison + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; 
GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v3f16__3_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_u_u_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f16_v3f16__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; 
GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f16_v3f16__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: 
global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; 
GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: 
global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: 
global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v2, s4, v1, v0 
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v3, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v3, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 
+; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> zeroinitializer + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__1_0_0_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 
+; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, 
v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_2_0_0: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, 
<3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x 
half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 
v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 
v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_2_0: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, 
v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__u_1_1_1(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4f16_v3f16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 
s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 
v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 
v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: 
global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4f16_v3f16__5_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = 
call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 
0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f16_v3f16__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x 
i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 
v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 
+; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + 
%shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + 
ret void +} + +define void @v_shuffle_v4f16_v3f16__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector 
<4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: 
global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_0_2_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 
0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: 
global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_u_2(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 
x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v3f16__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__1_3_3_3: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", 
"=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v3f16__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; 
GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4f16_v3f16__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4f16_v3f16__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + 
%shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = 
shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4f16_v3f16__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x 
half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: 
v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; 
GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_0_4_4: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, 
s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4f16_v3f16__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 
s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; 
GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__3_5_5_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 
v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v4f16_v3f16__5_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4f16_v3f16__5_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 
+ ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x 
half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: 
global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v3f16__5_5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v3f16__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v3f16__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v3f16__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v4f16_v3f16__u_u_u_u() { +; GFX9-LABEL: s_shuffle_v4f16_v3f16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4f16_v3f16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__3_u_u_u() { +; GFX9-LABEL: s_shuffle_v4f16_v3f16__3_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> 
%shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_0_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_1_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4f16_v3f16__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_2_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_3_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = 
shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_4_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void 
@s_shuffle_v4f16_v3f16__5_5_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_0_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_1_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_2_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_2_u: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_3_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_4_u() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_5_u() { +; GFX9-LABEL: s_shuffle_v4f16_v3f16__5_5_5_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_5_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_5_0: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_5_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_5_1: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_5_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4f16_v3f16__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_5_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_5_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x 
i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__2_0_0_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__3_0_0_0: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_u_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_1_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_2_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_3_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 
s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_4_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_u_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_1_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_2_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 
s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_3_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 
s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_4_0() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; 
GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> 
poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__2_1_1_1: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__4_1_1_1: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_u_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_0_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 
s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_2_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_3_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_4_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_u_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_0_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_0_1: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_2_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_2_1: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_3_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_4_1() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, 
s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 
x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_u_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_0_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, 
s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_1_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_3_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_4_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_u_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_0_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 
s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_1_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, 
s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_3_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_4_2() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__u_3_3_3() { +; GFX9-LABEL: s_shuffle_v4f16_v3f16__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x 
half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4f16_v3f16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__3_3_3_3() { +; GFX9-LABEL: s_shuffle_v4f16_v3f16__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4f16_v3f16__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_u_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = 
shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_0_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> 
poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_1_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> 
poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_2_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x 
half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_4_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + 
ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_u_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_0_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4f16_v3f16__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_1_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4f16_v3f16__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_2_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_4_3() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_4_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 
s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, 
s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_u_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 
x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_0_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_1_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_2_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_3_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: 
s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> 
asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_u_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x 
half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_0_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> 
%2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_1_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + 
%1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_2_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm 
"; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_3_4() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> 
%shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 
+; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_u_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_0_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4f16_v3f16__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_1_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_2_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_3_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_4_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_u_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = 
shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_0_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, 
<4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_1_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> 
poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_2_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 
x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_3_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v3f16__5_5_4_5() { +; GFX900-LABEL: s_shuffle_v4f16_v3f16__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v3f16__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v3f16__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %1 = shufflevector <4 x half> %vec0, <4 x half> poison, <3 x i32> + %2 = shufflevector <4 x half> %vec1, <4 x half> poison, <3 x i32> + %shuf = shufflevector <3 x half> %1, <3 x half> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll new file mode 100644 index 0000000000000..4a8321fe8bbcb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f16.v4f16.ll @@ -0,0 +1,24202 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4f16_v4f16__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v4f16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> poison + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: 
v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v4f16__4_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; 
GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__6_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; 
GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm 
"; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> 
+ store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_u_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_1_u: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_2_u: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> 
%vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", 
"=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: 
v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, v0, v1, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v0, v1, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; 
GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x 
half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__u_0_0_0: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: 
v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> zeroinitializer + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; 
GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__4_0_0_0: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__6_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f16_v4f16__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, 
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; 
GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v3, v0, v0, 
s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4f16_v4f16__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> 
asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_4_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, 
s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + 
store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__4_1_1_1: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, 
v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__6_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4f16_v4f16__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define 
void @v_shuffle_v4f16_v4f16__7_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x 
i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + 
%vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; 
GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_0_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, 
v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, 
v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f16_v4f16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: 
global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 
+; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__6_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f16_v4f16__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, 
v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: 
v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_0_2: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 
x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> 
asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: 
s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; 
GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__3_3_3_3(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: 
global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__6_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_0_3_3(ptr addrspace(1) inreg %ptr) { 
+; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret 
void +} + +define void @v_shuffle_v4f16_v4f16__7_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, 
<4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() 
+ %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: 
global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 3> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 3> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 3> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_1_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 3> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_2_3: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 3> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 3> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, 
v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 3> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v3, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 3> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v4f16__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> <i32 poison, i32 4, i32 4, i32 4> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> <i32 0, i32 4, i32 4, i32 4> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4f16_v4f16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> <i32 1, i32 4, i32 4, i32 4> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> <i32 2, i32 4, i32 4, i32 4> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> <i32 3, i32 4, i32 4, i32 4> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f16_v4f16__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 
x half> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 5, i32 4, i32 4, i32 4> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__6_4_4_4(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 6, i32 4, i32 4, i32 4> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 4, i32 4, i32 4> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 poison, i32 4, i32 4> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, 
v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 0, i32 4, i32 4> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v2, v2, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_3_4_4(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x 
i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_4_4(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4f16_v4f16__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4f16_v4f16__7_7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x 
half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; 
GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4f16_v4f16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, 
<4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", 
"=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, 
s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = 
call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define 
void @v_shuffle_v4f16_v4f16__6_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_5_5_5(ptr addrspace(1) inreg %ptr) { 
+; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v2, v2, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_3_5_5(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, 
ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4f16_v4f16__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: 
v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: 
v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_3_5(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 
8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_6_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v3, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v3, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__u_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__0_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__1_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__2_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_6_6_6: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__3_6_6_6(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v3, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v3, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x 
half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__4_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__5_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4f16_v4f16__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__6_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: 
v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_0_6_6: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, 
v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4f16_v4f16__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v2, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v3, v3, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v3, v3, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_3_6(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x 
half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void 
+} + +define void @v_shuffle_v4f16_v4f16__7_7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__u_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__u_7_7_7: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__0_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__1_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__1_7_7_7: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__2_7_7_7(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__3_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__4_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm 
"; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__5_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4f16_v4f16__6_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_2_7_7: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4f16_v4f16__7_3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, 
<4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_5_7_7(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 
0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, 
v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4f16_v4f16__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v2, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v3, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v3, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4f16_v4f16__7_7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f16_v4f16__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f16_v4f16__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f16_v4f16__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=v"() + %vec1 = call <4 x half> asm "; def $0", "=v"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + store <4 x half> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v4f16_v4f16__u_u_u_u() { +; GFX9-LABEL: s_shuffle_v4f16_v4f16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__0_u_u_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4f16_v4f16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__3_u_u_u() { +; GFX900-LABEL: 
s_shuffle_v4f16_v4f16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__4_u_u_u() { +; GFX9-LABEL: s_shuffle_v4f16_v4f16__4_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__6_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4f16_v4f16__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 
x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_0_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_1_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_2_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_3_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_4_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_4_u_u: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_5_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4f16_v4f16__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_6_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_u_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_0_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_1_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_2_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 
s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_3_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_4_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_5_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_6_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_7_u() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_7_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_7_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_7_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_7_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_7_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_7_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_7_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4f16_v4f16__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 
16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 
x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void 
@s_shuffle_v4f16_v4f16__6_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_u_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_1_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; 
GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_2_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_3_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 
+; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_4_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_5_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 
+; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_6_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_0_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, 
s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_u_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_1_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_2_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_3_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_4_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_5_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_6_0() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__4_1_1_1: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__6_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 
+; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_u_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_0_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; 
GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_2_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 
16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_3_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_4_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_5_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_6_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_1_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s4, 
s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_u_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_0_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_2_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_3_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_4_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_5_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_6_1() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_6_1: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__u_2_2_2() { +; GFX900-LABEL: 
s_shuffle_v4f16_v4f16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__1_2_2_2: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x 
half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__6_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 
+; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_u_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_0_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_0_2_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_1_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_1_2_2: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_3_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_4_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4f16_v4f16__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_5_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4f16_v4f16__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_6_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4f16_v4f16__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_2_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_2_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_u_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_0_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_1_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_3_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_4_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_5_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_6_2() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__1_3_3_3() { +; GFX900-LABEL: 
s_shuffle_v4f16_v4f16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__6_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_u_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_0_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_1_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_2_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_2_3_3: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_4_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4f16_v4f16__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_5_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_6_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, 
s4, s7 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_3_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_u_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_0_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_1_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_2_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_4_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: 
s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_5_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_6_3() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__u_4_4_4() { +; GFX9-LABEL: s_shuffle_v4f16_v4f16__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__4_4_4_4() { +; GFX9-LABEL: s_shuffle_v4f16_v4f16__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__6_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_u_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_0_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_1_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_2_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_3_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_5_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_6_4_4() { +; GFX900-LABEL: 
s_shuffle_v4f16_v4f16__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_4_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 
s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_u_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_0_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, 
s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_1_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_2_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, 
s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4> ; picks vec1[3], vec1[3], vec0[2], vec1[0] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_3_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT:
s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4> ; picks vec1[3], vec1[3], vec0[3], vec1[0] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_5_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT:
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4> ; picks vec1[3], vec1[3], vec1[1], vec1[0] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_6_4() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +;
GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4> ; picks vec1[3], vec1[3], vec1[2], vec1[0] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5> ; picks poison, vec1[1], vec1[1], vec1[1] + call void asm
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5> ; picks vec0[0], vec1[1], vec1[1], vec1[1] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret
void +} + +define void @s_shuffle_v4f16_v4f16__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5> ; picks vec0[1], vec1[1], vec1[1], vec1[1] + call void asm
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5> ; picks vec0[2], vec1[1], vec1[1], vec1[1] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret
void +} + +define void @s_shuffle_v4f16_v4f16__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5> ; picks vec0[3], vec1[1], vec1[1], vec1[1] + call void asm
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s10, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s10, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s10, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5> ; picks vec1[0], vec1[1], vec1[1], vec1[1] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32
s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5> ; picks vec1[1], vec1[1], vec1[1], vec1[1] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__6_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT:
;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5> ; picks vec1[2], vec1[1], vec1[1], vec1[1] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +;
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 5, i32 5, i32 5> ; picks vec1[3], vec1[1], vec1[1], vec1[1] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_u_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +;
GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 poison, i32 5, i32 5> ; picks vec1[3], poison, vec1[1], vec1[1] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_0_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1,
s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 0, i32 5, i32 5> ; picks vec1[3], vec0[0], vec1[1], vec1[1] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_1_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +;
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 1, i32 5, i32 5> ; picks vec1[3], vec0[1], vec1[1], vec1[1] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_2_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 2, i32 5, i32 5> ; picks vec1[3], vec0[2], vec1[1], vec1[1] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_3_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +;
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> <i32 7, i32 3, i32 5, i32 5> ; picks vec1[3], vec0[3], vec1[1], vec1[1] + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_4_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL:
s_shuffle_v4f16_v4f16__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_6_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_5_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_u_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + 
+define void @s_shuffle_v4f16_v4f16__7_7_0_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_1_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; 
def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_2_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_3_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: 
s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_4_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> 
%vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_6_5() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__u_6_6_6() { +; GFX900-LABEL: 
s_shuffle_v4f16_v4f16__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__0_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__1_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__2_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__3_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__4_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__4_6_6_6: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__5_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__6_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_u_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_0_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_1_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 
+; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_2_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_3_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 
+; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_4_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_5_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_6_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_u_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_0_6() { +; 
GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_1_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_1_6: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_2_6() { +; 
GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_3_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_3_6: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_4_6() { +; 
GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_5_6() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: 
s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__u_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__u_7_7_7: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__0_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__1_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 
s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__2_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__3_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 
s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__4_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__5_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__6_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_u_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s10, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_0_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_1_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_2_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_3_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_4_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_5_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_6_7_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_u_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_0_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_1_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_2_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_3_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_4_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_5_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} + +define void @s_shuffle_v4f16_v4f16__7_7_6_7() { +; GFX900-LABEL: s_shuffle_v4f16_v4f16__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s11, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f16_v4f16__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s11, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f16_v4f16__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s11, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x half> asm "; def $0", "=s"() + %vec1 = call <4 x half> asm "; def $0", "=s"() + %shuf = shufflevector <4 x half> %vec0, <4 x half> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x half> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll new file mode 100644 index 0000000000000..5208e3c21bc5f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll @@ -0,0 +1,6422 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4f32_v2f32__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v2f32__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> poison + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 
x float> %vec0, <2 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v2f32__2_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x 
float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret 
void +} + +define void @v_shuffle_v4f32_v2f32__3_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + 
%vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() 
+ %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = 
shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, 
align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> zeroinitializer + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__2_0_0_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> <i32 3, i32 0, i32 0, i32 0> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> <i32 3, i32 poison, i32 0, i32 0> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> <i32 3, i32 1, i32 0, i32 0> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> <i32 3, i32 2, i32 0, i32 0> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 0> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 0> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 0> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 0> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__1_1_1_1: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> <i32 3, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> <i32 3, i32 poison, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> <i32 3, i32 0, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v2f32__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v2f32__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 
x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector 
<2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> 
%shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4f32_v2f32__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 
v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4f32_v2f32__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> 
asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", 
"=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def 
$0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v2f32__3_3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v2f32__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=v"() + %vec1 = call <2 x float> asm "; def $0", "=v"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v4f32_v2f32__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_0_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_0_u_u: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_1_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_2_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; 
GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + 
%vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_0_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_1_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_2_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_3_u() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_3_u: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_3_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 
+; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_3_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 
x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_3_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_3_3: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__u_0_0_0: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_u_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_1_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: 
s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_2_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_u_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_1_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_2_0() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + 
+define void @s_shuffle_v4f32_v2f32__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_1_1_1: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_u_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_0_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_2_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_u_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_0_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def 
$0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_2_1() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4f32_v2f32__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_u_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_0_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_1_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_u_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_0_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; 
GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_1_2() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 
s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__0_3_3_3: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 
+; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__2_3_3_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_u_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_0_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_1_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_2_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_u_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_0_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_1_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v2f32__3_3_2_3() { +; GFX900-LABEL: s_shuffle_v4f32_v2f32__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v2f32__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v2f32__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <2 x float> asm "; def $0", "=s"() + %vec1 = call <2 x float> asm "; def $0", "=s"() + %shuf = shufflevector <2 x float> %vec0, <2 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll new file mode 100644 index 0000000000000..50372fa702a90 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll @@ -0,0 +1,14014 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4f32_v3f32__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v3f32__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> poison + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4f32_v3f32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4f32_v3f32__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v3f32__3_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4f32_v3f32__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; 
GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; 
GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + 
%shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x 
float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = 
shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> 
%vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f32_v3f32__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> 
asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> zeroinitializer + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) 
%ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_1_0_0: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4f32_v3f32__5_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 
x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm 
"; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> 
asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; 
GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 
+; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = 
shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4f32_v3f32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 
v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__4_1_1_1: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: 
global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__0_2_2_2: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 
v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; 
def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", 
"=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x 
float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = 
call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x 
float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 
x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, 
v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v3f32__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__0_3_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, 
align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v3f32__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4f32_v3f32__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 
+; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: 
global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: 
global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4f32_v3f32__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f32_v3f32__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 
v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; 
GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> 
asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4f32_v3f32__5_5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_2_4(ptr addrspace(1) inreg %ptr) { 
+; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4f32_v3f32__5_5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4f32_v3f32__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, 
v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_0_5_5: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f32_v3f32__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_3_5_5: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def 
$0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x 
float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, 
v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; 
GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def 
$0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v3f32__5_5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v3f32__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=v"() + %vec1 = call <3 x float> asm "; def $0", "=v"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v4f32_v3f32__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x 
float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_0_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_1_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> 
asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_2_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v4f32_v3f32__5_3_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_4_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 
s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_0_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> 
asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_1_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_2_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_3_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_4_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_4_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_5_u() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_5_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: 
s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_5_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_5_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x 
i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_5_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_5_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4f32_v3f32__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4f32_v3f32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; 
GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: 
s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_u_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_1_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4f32_v3f32__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_2_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_2_0_0: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_3_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s12 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s12 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s8 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_4_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_u_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_1_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_2_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 
x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_3_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = 
call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_4_0() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s13 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s9 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + 
%shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4f32_v3f32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", 
"=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; 
GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_u_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_0_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_2_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_3_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s12 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s12 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s8 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_4_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_u_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_u_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_0_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_2_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_3_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_4_1() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s13 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s9 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v4f32_v3f32__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v4f32_v3f32__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_u_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_0_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_1_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_1_2_2: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_3_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_4_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_2_2: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_u_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_0_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_1_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; 
GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_3_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; 
GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_4_2() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s13 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s9 +; 
GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x 
float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__3_3_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_u_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_0_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4f32_v3f32__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_1_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_2_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_4_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_u_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_0_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x 
float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_1_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_2_3() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_4_3() { +; GFX900-LABEL: 
s_shuffle_v4f32_v3f32__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> 
asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_u_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_0_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s13 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 
s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s9 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_1_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_2_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_2_4_4: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_3_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; 
GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_u_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_0_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_1_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_2_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_3_4() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 
s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_u_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_0_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_0_5_5: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_1_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_2_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 
s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_3_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> 
%vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_4_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_u_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_0_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_1_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_2_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_3_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v3f32__5_5_4_5() { +; GFX900-LABEL: s_shuffle_v4f32_v3f32__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v3f32__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v3f32__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 
s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x float> asm "; def $0", "=s"() + %vec1 = call <3 x float> asm "; def $0", "=s"() + %shuf = shufflevector <3 x float> %vec0, <3 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll new file mode 100644 index 0000000000000..e38c885cf23c5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll @@ -0,0 +1,24149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4f32_v4f32__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v4f32__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> poison + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4f32_v4f32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v4f32__4_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, 
<4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__6_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 
v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_0_u_u: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() 
+ %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x 
float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 
+; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; 
GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def 
$0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__0_0_0_0(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> zeroinitializer + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; 
GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, 
v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__6_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_0_0_0: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, 
v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4f32_v4f32__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x 
float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; 
GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 
v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__3_1_1_1(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 
+; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> <i32 4, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4f32_v4f32__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 5, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__6_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 6, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: 
global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 1, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 poison, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 0, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 2, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_3_1_1(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 3, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define 
void @v_shuffle_v4f32_v4f32__7_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 4, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def 
$0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 5, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 
+; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 6, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 1> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_0_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__0_2_2_2: +; GFX940: 
; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> 
+ store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: 
global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__6_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 
+; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 
0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 
v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_0_2: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, 
align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x 
float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; 
GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 
v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f32_v4f32__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__6_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_3_3_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4f32_v4f32__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, 
v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, 
v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 
v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; 
GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v4f32__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v4, 
v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4f32_v4f32__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> 
%vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__6_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_4_4_4(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4f32_v4f32__7_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_4_4(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 
+; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 
v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: 
v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4f32_v4f32__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__6_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; 
GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 5, i32 5, i32 5> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 poison, i32 5, i32 5> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 
+; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> <i32 7, i32 0, i32 5, i32 5> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; 
GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: 
v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_0_5(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define 
void @v_shuffle_v4f32_v4f32__7_7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) 
%ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x 
float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, 
ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4f32_v4f32__u_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__0_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__1_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__2_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__3_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__4_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__4_6_6_6: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__5_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 
v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__6_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_2_6_6: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4f32_v4f32__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + 
%vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4f32_v4f32__7_7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: v_mov_b32_e32 v9, v4 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; 
GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; 
GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__u_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x 
float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__0_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__1_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", 
"=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__2_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__3_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__4_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4f32_v4f32__5_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__6_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_0_7_7: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4f32_v4f32__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4f32_v4f32__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_5_7: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4f32_v4f32__7_7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4f32_v4f32__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4f32_v4f32__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4f32_v4f32__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; 
GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=v"() + %vec1 = call <4 x float> asm "; def $0", "=v"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + store <4 x float> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v4f32_v4f32__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; 
use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__6_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__6_u_u_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_u_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_0_u_u() { +; 
GFX900-LABEL: s_shuffle_v4f32_v4f32__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_1_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_2_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_3_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_4_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_5_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_6_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_u_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_0_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_1_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_2_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm 
"; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_3_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_4_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_5_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, 
s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_6_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_7_u() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_7_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_7_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_7_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_7_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> 
%vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_7_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_7_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_7_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: 
s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: 
s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__1_0_0_0() { +; 
GFX900-LABEL: s_shuffle_v4f32_v4f32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, 
s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__6_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_0_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_u_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x 
float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_1_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_2_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> 
%vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_3_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; 
use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_4_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> 
%shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_5_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_6_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_6_0_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_0_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_u_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_1_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_2_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; 
GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_3_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; 
GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_4_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; 
GFX900-NEXT: s_mov_b32 s15, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_5_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_6_0() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: 
s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call 
<4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__6_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_1_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_u_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_1_1: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_0_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_2_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_3_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_4_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 
+; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_5_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; 
GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_6_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_1_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_u_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_1: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_0_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_2_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_3_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_4_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_5_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; 
GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_6_1() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 
s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__5_2_2_2: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__6_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_2_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; 
GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_u_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4f32_v4f32__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_0_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_1_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_3_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4f32_v4f32__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_4_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_5_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_6_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_2_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> 
asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_u_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 
x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_0_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v4f32_v4f32__7_7_1_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_3_2() { +; 
GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_4_2() { +; GFX900-LABEL: 
s_shuffle_v4f32_v4f32__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_5_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_5_2: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_6_2() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; 
def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__6_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 
s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_3_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_u_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> 
%shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_0_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v4f32_v4f32__7_1_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_2_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_4_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; 
GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_5_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_6_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_3_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_u_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_0_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_0_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_1_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_2_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_4_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_5_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_6_3() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void 
+} + +define void @s_shuffle_v4f32_v4f32__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, 
s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__6_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_4_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: 
s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_u_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_0_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_1_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_2_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_2_4_4: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_3_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_5_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_6_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_4_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; 
GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_u_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_0_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; 
GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_1_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; 
GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_2_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_3_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_5_4() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_6_4() { 
+; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4f32_v4f32__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x 
float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__6_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; 
GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_5_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_u_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_0_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: 
s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_1_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_2_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_3_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", 
"=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_4_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_6_5_5() { +; GFX900-LABEL: 
s_shuffle_v4f32_v4f32__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_5_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; 
GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_u_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_0_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_1_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_2_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4f32_v4f32__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_3_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_4_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; 
GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_6_5() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x 
float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__u_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__0_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: 
s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__1_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__2_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__3_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__4_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__5_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__6_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_6_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_u_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_0_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_0_6_6: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_1_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_2_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, 
s2 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_3_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_4_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void 
@s_shuffle_v4f32_v4f32__7_5_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_6_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4f32_v4f32__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_u_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_0_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_1_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_2_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_3_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; 
GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_4_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_5_6() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__u_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__0_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__0_7_7_7: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__1_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__2_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, 
s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__3_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__4_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__5_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__6_7_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__6_7_7_7: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_u_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_u_7_7: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_0_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_1_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_1_7_7: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_2_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_3_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 
+; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_4_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 
x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_5_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_6_7_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_u_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_0_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_1_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: 
s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_2_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_3_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_4_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_5_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} + +define void @s_shuffle_v4f32_v4f32__7_7_6_7() { +; GFX900-LABEL: s_shuffle_v4f32_v4f32__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v4f32_v4f32__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4f32_v4f32__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x float> asm "; def $0", "=s"() + %vec1 = call <4 x float> asm "; def $0", "=s"() + %shuf = shufflevector <4 x float> %vec0, <4 x float> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x float> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v2i16.ll new file mode 100644 index 0000000000000..cac44b59a65ae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v2i16.ll @@ -0,0 +1,6199 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4i16_v2i16__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v2i16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> poison + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x 
i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v2i16__2_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> 
%vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_1_u_u(ptr addrspace(1) inreg %ptr) { 
+; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v2 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v2 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_2_u_u: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i16_v2i16__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 
0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i16_v2i16__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; 
GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> zeroinitializer + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__1_0_0_0(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; 
GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define 
void @v_shuffle_v4i16_v2i16__3_3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, 
ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> 
%vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__0_1_1_1(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v2i16__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v2i16__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; 
GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_u_2_2: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_0_2_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, 
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_u_2: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, 
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__u_3_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__0_3_3_3: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 
0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, 
s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 
0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, 
v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v3 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", 
"=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v0 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v3 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v3 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v2i16__3_3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v2i16__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v1 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v2i16__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v2i16__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=v"() + %vec1 = call <2 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@s_shuffle_v4i16_v2i16__u_u_u_u() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__2_u_u_u() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__2_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_0_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_1_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_2_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_u_u() { +; 
GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_0_u() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_1_u() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_2_u() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__3_3_2_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_3_u() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_3_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_3_0() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, 
s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_3_1() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i16_v2i16__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_3_2() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: 
s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__u_0_0_0() { +; GFX900-LABEL: 
s_shuffle_v4i16_v2i16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + 
ret void +} + +define void @s_shuffle_v4i16_v2i16__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_u_0_0() { +; GFX900-LABEL: 
s_shuffle_v4i16_v2i16__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_1_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_2_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i16_v2i16__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_u_0() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_1_0() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 
s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_2_0() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i16_v2i16__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__u_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__u_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__0_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__0_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__1_1_1_1() { +; 
GFX900-LABEL: s_shuffle_v4i16_v2i16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__2_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__2_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 
x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_u_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_0_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_2_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_u_1() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_0_1() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s11 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s11 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s11 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_2_1() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__u_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s10 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s10 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s10 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> 
%vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__2_2_2_2() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v4i16_v2i16__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_u_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_0_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_1_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_u_2() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret 
void +} + +define void @s_shuffle_v4i16_v2i16__3_3_0_2() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_1_2() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__u_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, 
s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; 
def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v4i16_v2i16__2_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__2_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s10 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_u_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> 
asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_0_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> 
+ call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_1_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_2_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_2_3_3: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_u_3() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__3_3_u_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", 
"=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_0_3() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void 
+} + +define void @s_shuffle_v4i16_v2i16__3_3_1_3() { +; GFX900-LABEL: s_shuffle_v4i16_v2i16__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s4 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s5 +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v2i16__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v2i16__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s0 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s1 +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v2i16__3_3_2_3() { +; GFX9-LABEL: s_shuffle_v4i16_v2i16__3_3_2_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; 
GFX9-NEXT: ; def s11 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i16> asm "; def $0", "=s"() + %vec1 = call <2 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll new file mode 100644 index 0000000000000..89140675a048b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v3i16.ll @@ -0,0 +1,13983 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4i16_v3i16__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v3i16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> poison + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 
0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: 
v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v3i16__3_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = 
call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: 
global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x 
i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, 
align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x 
i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, 
v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: 
global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: 
global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = 
call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: 
v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; 
GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v3, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v3, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__0_0_0_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> zeroinitializer + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; 
GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: 
global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_u_0_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_0_0(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, 
<3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 
v1, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__0_1_1_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 
+; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 
v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 
x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; 
def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, 
v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: 
v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = 
shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 
v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 
v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, 
v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i16_v3i16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_1_2_2(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = 
shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, 
v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) 
%ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v3i16__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4i16_v3i16__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v3i16__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: 
v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; 
GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; 
GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 
= shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() 
+ %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; 
def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = 
shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: 
global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i16_v3i16__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__u_4_4_4: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4i16_v3i16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, 
v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4i16_v3i16__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i16_v3i16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i16_v3i16__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4i16_v3i16__5_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = 
call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__0_5_5_5: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: 
v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4i16_v3i16__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__4_5_5_5(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_u_5_5(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4i16_v3i16__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x 
i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: 
v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 
+; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; 
GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 
0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v3i16__5_5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v3i16__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v3i16__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v3i16__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v4i16_v3i16__u_u_u_u() { +; GFX9-LABEL: s_shuffle_v4i16_v3i16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__3_u_u_u() { +; GFX9-LABEL: 
s_shuffle_v4i16_v3i16__3_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> 
%shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_0_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_1_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_2_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_3_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_4_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> 
%1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_0_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_1_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_2_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_3_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_4_u() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_5_u() { +; GFX9-LABEL: s_shuffle_v4i16_v3i16__5_5_5_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_5_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_5_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i16_v3i16__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_5_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_5_2: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_5_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_5_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__0_0_0_0() { +; 
GFX900-LABEL: s_shuffle_v4i16_v3i16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_u_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x 
i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_1_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x 
i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_2_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = 
shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_3_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x 
i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_4_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> 
%2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_u_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v4i16_v3i16__5_5_1_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_2_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define 
void @s_shuffle_v4i16_v3i16__5_5_3_0() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_4_0() { +; 
GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret 
void +} + +define void @s_shuffle_v4i16_v3i16__u_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i16_v3i16__u_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__0_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i16_v3i16__0_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__3_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i16_v3i16__3_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_u_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_0_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_2_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_3_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i16_v3i16__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_4_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_4_1_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_u_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_0_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_2_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s1, s0 +; 
GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_3_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_4_1() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + 
ret void +} + +define void @s_shuffle_v4i16_v3i16__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, 
s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_u_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_mov_b32 s10, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 
x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_0_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> 
asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_1_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> 
%vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_3_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = 
shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_4_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x 
i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> 
%2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_u_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> 
%shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_0_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v4i16_v3i16__5_5_1_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_3_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define 
void @s_shuffle_v4i16_v3i16__5_5_4_2() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__u_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i16_v3i16__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__1_3_3_3() { +; 
GFX900-LABEL: s_shuffle_v4i16_v3i16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__3_3_3_3() { +; GFX9-LABEL: s_shuffle_v4i16_v3i16__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_u_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: 
s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_0_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_1_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_2_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 
x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_4_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; 
use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_u_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_0_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_1_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i16_v3i16__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_2_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_4_3() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__u_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i16_v3i16__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + 
+define void @s_shuffle_v4i16_v3i16__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__1_4_4_4() { +; 
GFX900-LABEL: s_shuffle_v4i16_v3i16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__2_4_4_4: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__3_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i16_v3i16__3_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm 
"; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + 
+define void @s_shuffle_v4i16_v3i16__5_u_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_0_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_1_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; 
GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_2_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_3_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_u_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_0_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s0, s2 +; 
GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_1_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_2_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_3_4() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, 
<3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i16_v3i16__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_u_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; 
GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_0_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_1_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i16_v3i16__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_2_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_2_5_5: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_3_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, 
s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_4_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = 
shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_u_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_0_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_0_5: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_1_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_2_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_2_5: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_3_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v3i16__5_5_4_5() { +; GFX900-LABEL: s_shuffle_v4i16_v3i16__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v3i16__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v3i16__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %1 = shufflevector <4 x i16> %vec0, <4 x i16> poison, <3 x i32> + %2 = shufflevector <4 x i16> %vec1, <4 x i16> poison, <3 x i32> + %shuf = shufflevector <3 x i16> %1, <3 x i16> %2, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll new file mode 100644 index 0000000000000..be155605a220b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i16.v4i16.ll @@ -0,0 +1,23344 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4i16_v4i16__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v4i16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> poison + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, 
v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v4i16__4_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 
v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__6_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, 
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, 
v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() 
+ %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; 
def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + 
ret void +} + +define void @v_shuffle_v4i16_v4i16__7_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4i16_v4i16__7_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_3_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v0, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, s4, v0, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_alignbit_b32 v3, s0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def 
$0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret 
void +} + +define void @v_shuffle_v4i16_v4i16__7_7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v3, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 
x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; 
def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 
+; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; 
GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, v0, v1, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v0, v1, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 
x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_7_7(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> zeroinitializer + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, 
v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4i16_v4i16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 
+; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__6_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_0_0_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: 
global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_u_0(ptr addrspace(1) inreg %ptr) { 
+; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v4i16_v4i16__7_7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 
x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 
+; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v0, v2, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; 
def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4i16_v4i16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, 
v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__6_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 
v1, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4i16_v4i16__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v4i16_v4i16__7_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 
x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v2, v3, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", 
"=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; 
GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 
v2, v3, v3, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; 
GFX940-NEXT: v_bfi_b32 v1, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v0, v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, 
v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 
0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; 
GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__5_2_2_2(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr 
addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__6_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, 
v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; 
GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_3_2_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, 
v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v1, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret 
void +} + +define void @v_shuffle_v4i16_v4i16__7_7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 
x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> 
%shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__1_3_3_3(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v2, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v2, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__6_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v3, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, 
v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; 
GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v2, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v2, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_6_3_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_3_3(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + 
+define void @v_shuffle_v4i16_v4i16__7_7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void 
@v_shuffle_v4i16_v4i16__7_7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + 
%vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v1, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v2, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; 
GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v2, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v3, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v3, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v4i16__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__0_4_4_4: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, 
ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i16_v4i16__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX900-NEXT: 
global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__6_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__6_4_4_4: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: 
v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, 
v2, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v2, v2, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: 
v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i16_v4i16__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 
v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v2, v0, v0, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v3, v0, v0, 16 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 
0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__0_5_5_5: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 
+; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v2, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v2, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__6_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v2, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v0, v1, s4 +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 
v2, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; 
GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v2, v2, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v2, v2, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v2, v2, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v0, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v0, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; 
GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v0, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i16_v4i16__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v0, s4 +; 
GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v2, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v2, s4, v1, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_bfi_b32 v3, s4, v1, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v3, s2, v1, v0 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__u_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: 
v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__0_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 
+; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__1_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v3, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__2_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__3_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v3, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v3, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__4_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__5_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__6_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, s4, v1, 16 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 
0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, s0, v1, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 
0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_1_6_6: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: 
s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_alignbit_b32 v0, 
v0, v1, 16 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, 
v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i16_v4i16__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v2, v2, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v3, v3, v0, 16 +; GFX90A-NEXT: 
global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_perm_b32 v2, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v3, v3, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_perm_b32 v1, v3, v1, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v1, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v1, v3, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v0, s4 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: s_mov_b32 s4, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v0, s4 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x5040100 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v0, s2 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__u_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, 
v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__0_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 0, i32 7, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__1_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 1, i32 7, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__2_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v0, s4, v1, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v0, s2, v1, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 2, i32 7, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__3_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i16_v4i16__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v3, v1, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 3, i32 7, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__4_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: v_bfi_b32 v0, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 4, i32 7, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__5_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__5_7_7_7: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 5, i32 7, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__6_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 6, i32 7, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, s4, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, s0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix ('u' = poison lane): indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 poison, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 0, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v0, 
v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 1, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v2, v3, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; GFX900-NEXT: global_store_dwordx2 v4, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v5, v3, v3, s4 +; GFX90A-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX90A-NEXT: global_store_dwordx2 v6, v[4:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v5, v3, v3, s2 +; GFX940-NEXT: v_alignbit_b32 v4, v1, v3, 16 +; GFX940-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 2, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v0, v1, v3, s4 +; GFX90A-NEXT: v_perm_b32 v1, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v3, s2 +; GFX940-NEXT: v_perm_b32 v1, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 3, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v0, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 4, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v0, v1, s4 +; 
GFX90A-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v0, v1, s2 +; GFX940-NEXT: v_perm_b32 v1, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 5, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v1, s4 +; GFX90A-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v1, s2 +; GFX940-NEXT: v_alignbit_b32 v2, v1, v1, 16 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 6, i32 7, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 
0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix ('u' = poison lane): indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: v_bfi_b32 v1, s4, v0, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v0, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v0, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + ; Mask mirrors the name suffix: indices 0-3 select %vec0 lanes, 4-7 select %vec1 lanes. + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 7> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v2, v0, s4 +; GFX900-NEXT: v_perm_b32 v0, v2, v2, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v0, s4 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v0, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v1, s4, v1, v3 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v1, s2, v1, v3 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX900-NEXT: v_perm_b32 v0, v3, v3, s4 +; GFX900-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v1, v3, v1, s4 +; GFX90A-NEXT: 
v_perm_b32 v0, v3, v3, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v1, v3, v1, s2 +; GFX940-NEXT: v_perm_b32 v0, v3, v3, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xffff +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_bfi_b32 v2, s4, v0, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_bfi_b32 v3, s4, v0, v1 +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0xffff +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_bfi_b32 v3, s2, v0, v1 +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_perm_b32 v2, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v3, v[1:2], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_perm_b32 v3, v1, v0, s4 +; GFX90A-NEXT: v_perm_b32 v2, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_perm_b32 v3, v1, v0, s2 +; GFX940-NEXT: v_perm_b32 v2, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @v_shuffle_v4i16_v4i16__7_7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i16_v4i16__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i16_v4i16__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, 0x7060302 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i16_v4i16__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s2, 0x7060302 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_perm_b32 v0, v1, v1, s2 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=v"() + %vec1 = call <4 x i16> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + store <4 x i16> %shuf, ptr addrspace(1) %ptr, align 8 + ret void +} + +define void @s_shuffle_v4i16_v4i16__u_u_u_u() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__4_u_u_u() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__4_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__6_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_0_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_1_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_2_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_3_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> 
%shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_4_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_5_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_6_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_u_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v4i16_v4i16__7_7_0_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_1_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_2_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_3_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_4_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_4_u: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_5_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_6_u() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__7_7_6_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_7_u() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s11, s5, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s11, s5, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s11, s1, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_7_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_7_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector 
<4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_7_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_7_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s7, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s7, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s3, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_7_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_7_4: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_7_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_7_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__0_0_0_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__2_0_0_0: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_lshl_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_lshl_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_lshl_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__6_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_pack_ll_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_u_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_u_0_0: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_1_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_2_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_3_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_4_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_4_0_0: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_5_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_6_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_0_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_u_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> 
%vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_1_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; 
use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_2_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_3_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_3_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_4_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_5_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s5, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_6_0() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 
s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__u_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__u_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret 
void +} + +define void @s_shuffle_v4i16_v4i16__0_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__0_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 
x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__4_1_1_1() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__4_1_1_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__6_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX900-NEXT: 
s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i16_v4i16__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_u_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_0_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_2_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 
s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_3_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4i16_v4i16__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_4_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_5_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_6_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; 
GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_1_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_u_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_0_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_2_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_3_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_4_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_5_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_6_1() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s7, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_pack_lh_b32_b16 s11, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + 
+define void @s_shuffle_v4i16_v4i16__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__6_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v4i16_v4i16__7_u_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_0_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_1_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_3_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_4_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i16_v4i16__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_5_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_6_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_2_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: 
s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_u_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i16_v4i16__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_0_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_1_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_3_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, 
s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_4_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_5_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s6, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s2, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x 
i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_6_2() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + 
+define void @s_shuffle_v4i16_v4i16__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i16_v4i16__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__1_3_3_3: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: s_shuffle_v4i16_v4i16__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__6_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_u_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s7, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s3, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_0_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s7, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s3, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_1_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use 
$0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_2_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void 
@s_shuffle_v4i16_v4i16__7_4_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_5_3_3() { +; GFX900-LABEL: 
s_shuffle_v4i16_v4i16__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_6_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_3_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_u_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_0_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_1_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_2_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_4_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s6, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s6, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s2, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_5_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> 
asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_6_3() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s7, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s7, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s3, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + 
+define void @s_shuffle_v4i16_v4i16__u_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__4_4_4_4() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> 
%shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__6_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 4, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_u_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; 
GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 poison, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_0_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i16_v4i16__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 0, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_1_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 1, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_2_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 2, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_3_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s2 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 3, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_5_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 5, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_6_4_4() { +; GFX900-LABEL: 
s_shuffle_v4i16_v4i16__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 6, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_4_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_u_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshl_b32 s11, s4, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshl_b32 s11, s4, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshl_b32 s11, s0, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_0_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_1_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_2_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_3_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_5_4() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s6, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_6_4() { +; 
GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__u_5_5_5() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__u_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> 
asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + 
+define void @s_shuffle_v4i16_v4i16__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_pack_hh_b32_b16 s10, s5, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__4_5_5_5() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__4_5_5_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s11, s10, s10 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> 
%vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s0 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__6_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, 
s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, 
s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_u_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_0_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_1_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> 
%vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_2_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; 
use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_3_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s6, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s2, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_4_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_4_5_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_6_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_5_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s4 +; GFX90A-NEXT: 
s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_u_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_0_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = 
call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_1_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret 
void +} + +define void @s_shuffle_v4i16_v4i16__7_7_2_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s5, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s5, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s1, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_3_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s6 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s6 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s2 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_4_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_6_5() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__u_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_lshl_b32 s10, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s10, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_lshl_b32 s10, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__0_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__1_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__2_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__3_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__3_6_6_6: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__4_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__5_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__5_6_6_6: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__6_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i16_v4i16__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_mov_b32 s11, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_u_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_0_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_0_6_6: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_1_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_2_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_3_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 
s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_4_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_5_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4i16_v4i16__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_6_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_u_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: s_lshl_b32 s11, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: s_lshl_b32 s11, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: s_lshl_b32 s11, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_0_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_1_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_2_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s5, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s5, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s1, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_3_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i16_v4i16__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_4_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_5_6() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s4, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s0, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__u_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> 
%vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__0_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__1_7_7_7() { +; 
GFX900-LABEL: s_shuffle_v4i16_v4i16__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__2_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s5, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__3_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s7 +; GFX900-NEXT: 
s_pack_hh_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__4_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__5_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__6_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> 
%vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_u_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s10, s5, 16 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s10, s5, 16 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s10, s1, 16 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_0_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_lshr_b32 s5, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s5, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s1, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_1_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX900-NEXT: 
s_pack_hh_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_2_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s7, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s7, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s3, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_3_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_4_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s6, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s6, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s6, s4 +; GFX90A-NEXT: 
s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s2, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s2, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_5_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s4 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_pack_hh_b32_b16 s10, s1, s0 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_6_7_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_lshr_b32 s4, s5, 16 +; GFX900-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_lshr_b32 s4, s5, 16 +; GFX90A-NEXT: s_pack_ll_b32_b16 s10, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_lshr_b32 s0, s1, 16 +; GFX940-NEXT: s_pack_ll_b32_b16 s10, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_u_7() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__7_7_u_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_0_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_1_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_pack_hh_b32_b16 s11, s0, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_2_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s5, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s5, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s1, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_3_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s5, s7 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s5, s7 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s7, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s1, s3 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s3, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x 
i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_4_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_lh_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_lh_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_lh_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_5_7() { +; GFX900-LABEL: s_shuffle_v4i16_v4i16__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_pack_hh_b32_b16 s11, s4, s5 +; GFX900-NEXT: s_pack_hh_b32_b16 
s10, s5, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[10:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i16_v4i16__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_pack_hh_b32_b16 s11, s4, s5 +; GFX90A-NEXT: s_pack_hh_b32_b16 s10, s5, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[10:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i16_v4i16__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_pack_hh_b32_b16 s11, s0, s1 +; GFX940-NEXT: s_pack_hh_b32_b16 s10, s1, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[10:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} + +define void @s_shuffle_v4i16_v4i16__7_7_6_7() { +; GFX9-LABEL: s_shuffle_v4i16_v4i16__7_7_6_7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; def s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_pack_hh_b32_b16 s10, s11, s11 +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; use s[10:11] +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i16> asm "; def $0", "=s"() + %vec1 = call <4 x i16> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i16> %vec0, <4 x i16> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:11]}"(<4 x i16> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll new file mode 100644 index 0000000000000..975e5027266ff --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll @@ -0,0 +1,6434 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4i32_v2i32__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v2i32__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> poison + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x 
i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v2i32__2_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x 
i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4i32_v2i32__3_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i32_v2i32__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i32_v2i32__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = 
shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = 
shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> 
%shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_3_3(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> zeroinitializer + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4i32_v2i32__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, 
v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i32_v2i32__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_2_1: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v2i32__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v4i32_v2i32__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, 
<4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v2i32__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4i32_v2i32__3_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4i32_v2i32__3_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4i32_v2i32__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, 
v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: 
global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_u_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4i32_v2i32__3_3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v2i32__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4i32_v2i32__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v2i32__3_3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4i32_v2i32__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v2i32__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v2i32__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=v"() + %vec1 = call <2 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v4i32_v2i32__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__1_u_u_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_0_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 
s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_1_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_2_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_0_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_1_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_2_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_3_u() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> 
asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_3_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect 
"; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_3_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_3_2() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i32_v2i32__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: s_shuffle_v4i32_v2i32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; 
def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_u_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_1_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_1_0_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_2_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_u_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + 
+define void @s_shuffle_v4i32_v2i32__3_3_1_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_2_0() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i32_v2i32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> 
%vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_u_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_u_1_1: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_0_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_2_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_u_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 
= call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_0_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_2_1() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i32_v2i32__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__1_2_2_2() { +; 
GFX900-LABEL: s_shuffle_v4i32_v2i32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_u_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_u_2_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_0_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_0_2_2: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_1_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_u_2() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 
x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_0_2() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_1_2() { +; 
GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__u_3_3_3: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; 
GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_u_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_0_3_3() { +; GFX900-LABEL: 
s_shuffle_v4i32_v2i32__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_1_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_2_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i32_v2i32__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_u_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_u_3: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_0_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_1_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() 
+ %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v2i32__3_3_2_3() { +; GFX900-LABEL: s_shuffle_v4i32_v2i32__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v2i32__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v2i32__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i32> asm "; def $0", "=s"() + %vec1 = call <2 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i32> %vec0, <2 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll new file mode 100644 index 0000000000000..192cd286e244b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll @@ -0,0 +1,14014 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4i32_v3i32__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v3i32__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> poison + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> 
%vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v3i32__3_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, 
v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; 
GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) 
%ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: 
v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_5_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: 
global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + 
%shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> zeroinitializer + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +;
GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> <i32 3, i32 0, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT:
v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 4, i32 0, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +;
GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 0, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;
def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 poison, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT:
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 1, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL:
v_shuffle_v4i32_v3i32__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 2, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7,
v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 3, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +;
GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 4, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +;
GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 5, i32 0, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def
v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 5, i32 poison, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT:
v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 5, i32 1, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0)
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 5, i32 2, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_3_0(ptr addrspace(1) inreg %ptr) { +;
GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> <i32 5, i32 5, i32 3, i32 0> + store <4 x i32> %shuf, ptr addrspace(1) %ptr,
align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = 
shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__1_1_1_1: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_3_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i32_v3i32__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + 
store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i32_v3i32__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, 
v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v3i32__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4i32_v3i32__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v3i32__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__4_3_3_3: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; 
GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 
v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 
v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; 
GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def 
$0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x 
i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + 
%shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__3_4_4_4: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, 
v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 
v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 
v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call 
<3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__0_5_5_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__1_5_5_5: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__2_5_5_5(ptr addrspace(1) inreg %ptr) 
{ +; GFX900-LABEL: v_shuffle_v4i32_v3i32__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4i32_v3i32__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i32_v3i32__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v3i32__5_5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v3i32__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: 
global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=v"() + %vec1 = call <3 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v4i32_v3i32__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v4i32_v3i32__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_0_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_1_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_2_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> 
asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_3_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_4_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_u_u: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_0_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; 
GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_1_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_2_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + 
ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_3_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_4_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_5_u() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_5_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_5_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, 
s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_5_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def 
$0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_5_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_5_4() { +; GFX900-LABEL: 
s_shuffle_v4i32_v3i32__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__2_0_0_0() { +; 
GFX900-LABEL: s_shuffle_v4i32_v3i32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; 
GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_u_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_1_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4i32_v3i32__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_2_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_2_0_0: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_3_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s12 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s12 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s8 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_4_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; 
GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_u_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_1_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_2_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call 
<3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_3_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = 
shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_4_0() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s13 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s9 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 
x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i32_v3i32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; 
use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_u_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_0_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 
s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_2_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_3_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s12 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s12 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s8 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_4_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_4_1_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_u_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_0_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 
+; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_2_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_3_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_4_1() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s13 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s9 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: 
s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_u_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_0_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_1_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_3_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_4_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_u_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + 
%shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_0_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x 
i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_1_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x 
i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_3_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v4i32_v3i32__5_5_4_2() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s13 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s9 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__u_3_3_3() { +; 
GFX900-LABEL: s_shuffle_v4i32_v3i32__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i32_v3i32__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__4_3_3_3: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_u_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_0_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_1_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_2_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_4_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + 
+define void @s_shuffle_v4i32_v3i32__5_5_u_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_0_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, 
s14 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_1_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s9 +; 
GFX900-NEXT: s_mov_b32 s7, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_2_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_4_3() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", 
"=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_u_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_0_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s13 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s9 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_1_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> 
%vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_2_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_3_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_u_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_0_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; 
GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_1_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: 
s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_2_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_3_4() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v4i32_v3i32__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i32_v3i32__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__4_5_5_5: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_u_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> 
%vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_0_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_1_5_5() { +; 
GFX900-LABEL: s_shuffle_v4i32_v3i32__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_2_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_3_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s10 +; 
GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_4_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_u_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 
+; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_0_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s10 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_1_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_2_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + 
%shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_3_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v3i32__5_5_4_5() { +; GFX900-LABEL: s_shuffle_v4i32_v3i32__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v3i32__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v3i32__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i32> asm "; def $0", "=s"() + %vec1 = call <3 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i32> %vec0, <3 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll new file mode 100644 index 0000000000000..5db323f95d692 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll @@ -0,0 +1,24149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4i32_v4i32__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v4i32__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> poison + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> 
%vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v4i32__4_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__6_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = 
call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret 
void +} + +define void @v_shuffle_v4i32_v4i32__7_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_3_u_u(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_u_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 
+; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4i32_v4i32__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 
v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; 
GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4i32_v4i32__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4i32_v4i32__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> zeroinitializer + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; 
GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x 
i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__6_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; 
GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i32_v4i32__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_0_0: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: 
global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 
+; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4i32_v4i32__7_7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> 
%vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; 
def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + 
%vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__0_1_1_1: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, 
v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__6_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 
+; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, 
v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; 
GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i32_v4i32__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: 
global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 
+; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4i32_v4i32__7_7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> 
%shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> 
%vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", 
"=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i32_v4i32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; 
GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__3_2_2_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__6_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: 
v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: 
v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 
v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector 
<4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_3_3_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__6_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_3_3_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_3_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_0_3_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i32_v4i32__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 
+; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 
0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v4i32__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 
v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i32_v4i32__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: 
v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__6_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + 
%vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4i32_v4i32__7_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def 
$0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x 
i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 
v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 
+; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 5, i32 4, i32 4> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + 
%vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 6, i32 4, i32 4> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 4> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void 
+} + +define void @v_shuffle_v4i32_v4i32__7_7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 4> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_2_4: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_3_4: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + 
ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4i32_v4i32__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i32_v4i32__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i32_v4i32__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 
v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__6_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i32_v4i32__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: 
v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 
+; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: 
v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_0_5(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4i32_v4i32__7_7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 
+ ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + 
store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> 
asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define 
void @v_shuffle_v4i32_v4i32__7_7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__u_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4i32_v4i32__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__0_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__1_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 
+; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__2_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__3_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__4_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__5_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__5_6_6_6: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__6_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 
+; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def 
$0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_u_6(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_1_6: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v1 +; GFX900-NEXT: v_mov_b32_e32 v9, v4 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 
16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> 
+ store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf 
= shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, 
ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__u_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__0_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__1_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__2_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__3_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__3_7_7_7: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__4_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4i32_v4i32__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__5_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 
v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__6_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__6_7_7_7: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; 
GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 
v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: 
v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; 
GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, 
v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, 
ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> 
%vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm 
"; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> 
asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 
x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4i32_v4i32__7_7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i32_v4i32__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i32_v4i32__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i32_v4i32__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=v"() + %vec1 = call <4 x i32> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + store <4 x i32> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v4i32_v4i32__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> 
%shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__6_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_0_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_1_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_2_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_3_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_4_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_5_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i32_v4i32__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_6_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_u_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_0_u() { +; GFX900-LABEL: 
s_shuffle_v4i32_v4i32__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_1_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_2_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 
+; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_3_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_3_u: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_4_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_5_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: 
s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_6_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_7_u() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_7_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_7_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_7_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_7_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i32_v4i32__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_7_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_7_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; 
GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_7_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_7_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__6_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__6_0_0_0: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_u_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_1_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_2_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; 
GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_3_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 
s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_4_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_5_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i32_v4i32__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_6_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_0_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_u_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_1_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, 
s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_2_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: 
s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_3_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_4_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_5_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_6_0() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> 
%vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4i32_v4i32__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void 
asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: 
s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__6_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i32_v4i32__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 1, i32 1, i32 1> ; lanes: vec1[3], vec0[1], vec0[1], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_u_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; 
GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 poison, i32 1, i32 1> ; lanes: vec1[3], poison, vec0[1], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_0_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 
s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 0, i32 1, i32 1> ; lanes: vec1[3], vec0[0], vec0[1], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_2_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 2, i32 1, i32 1> ; lanes: vec1[3], vec0[2], vec0[1], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_3_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 3, i32 1, i32 1> ; lanes: vec1[3], vec0[3], vec0[1], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_4_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> 
asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 4, i32 1, i32 1> ; lanes: vec1[3], vec1[0], vec0[1], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_5_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 5, i32 1, i32 1> ; lanes: vec1[3], vec1[1], vec0[1], vec0[1] + call void asm 
sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_6_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 6, i32 1, i32 1> ; lanes: vec1[3], vec1[2], vec0[1], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + 
ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_1_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 1> ; lanes: vec1[3], vec1[3], vec0[1], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v4i32_v4i32__7_7_u_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 1> ; lanes: vec1[3], vec1[3], poison, vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_0_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 1> ; lanes: vec1[3], vec1[3], vec0[0], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_2_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 1> ; lanes: vec1[3], vec1[3], vec0[2], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_3_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1> ; lanes: vec1[3], vec1[3], vec0[3], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_4_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 1> ; lanes: vec1[3], vec1[3], vec1[0], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_5_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 1> ; lanes: vec1[3], vec1[3], vec1[1], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_6_1() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 
s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 1> ; lanes: vec1[3], vec1[3], vec1[2], vec0[1] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2> ; lanes: poison, vec0[2], vec0[2], vec0[2] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 
s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2> ; lanes: vec0[0], vec0[2], vec0[2], vec0[2] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2> ; lanes: vec0[1], vec0[2], vec0[2], vec0[2] + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__2_2_2_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__6_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void 
asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_u_2_2() { +; GFX900-LABEL: 
s_shuffle_v4i32_v4i32__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_0_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_1_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 
+; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_3_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_3_2_2: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_4_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_5_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: 
s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_6_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i32_v4i32__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_2_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_u_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; 
GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_0_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: 
s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_1_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_3_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_4_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_5_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 
= call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_6_2() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__0_3_3_3: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + 
ret void +} + +define void @s_shuffle_v4i32_v4i32__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: s_shuffle_v4i32_v4i32__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__6_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_u_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_0_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def 
$0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_1_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_2_3_3() 
{ +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_4_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_5_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_6_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_3_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_u_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_0_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_1_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_2_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_4_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; 
GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_5_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_6_3() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + 
call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__6_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: 
s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_u_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_0_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + 
+define void @s_shuffle_v4i32_v4i32__7_1_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_2_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_3_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_5_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 5, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_6_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_6_4_4: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 6, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_4_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 4> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_u_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 
s5, s3 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 4> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_0_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s4 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 4> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_1_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 4> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_2_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def 
$0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 4> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_3_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x 
i32> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 4> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_5_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 4> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_6_4() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 4> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__u_5_5_5: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 poison, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 1, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 2, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 3, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 4, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__6_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 6, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_5_5_5: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 5, i32 5, i32 5> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_u_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 
= call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_0_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf 
= shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_1_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v4i32_v4i32__7_2_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_3_5_5() { +; GFX900-LABEL: 
s_shuffle_v4i32_v4i32__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_4_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_4_5_5: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_6_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_5_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_u_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_0_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_1_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_2_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_3_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_4_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void 
@s_shuffle_v4i32_v4i32__7_7_6_5() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__u_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__0_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__1_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__2_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__2_6_6_6: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__3_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 3, i32 6, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__4_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 4, i32 6, i32 6, i32 6> + call void asm sideeffect "; use $0",
"{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__5_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 5, i32 6, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__6_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +;
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 6, i32 6, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT:
s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 6, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_u_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +;
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 poison, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_0_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"()
+ %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 0, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_1_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 1, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} +
+define void @s_shuffle_v4i32_v4i32__7_2_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 2, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_3_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 3, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_4_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT:
s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 4, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_5_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT:
; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 5, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_6_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +;
GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_u_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 6> + call void asm sideeffect "; use
$0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_0_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret
void +} + +define void @s_shuffle_v4i32_v4i32__7_7_1_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void
@s_shuffle_v4i32_v4i32__7_7_2_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_3_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0)
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 6> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_4_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ;
def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_5_6() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__u_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__0_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__1_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__2_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def 
$0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__3_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect 
"; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__4_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__5_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__6_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_u_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", 
"=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_0_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; 
use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_1_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_2_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_2_7_7: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_3_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_4_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s7 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_5_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: 
s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_6_7_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_u_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_0_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_1_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_2_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s11 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_3_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i32_v4i32__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_4_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_5_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_5_7: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} + +define void @s_shuffle_v4i32_v4i32__7_7_6_7() { +; GFX900-LABEL: s_shuffle_v4i32_v4i32__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i32_v4i32__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i32_v4i32__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i32> asm "; def $0", "=s"() + %vec1 = call <4 x i32> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x i32> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll new file mode 100644 index 0000000000000..86bac343f7911 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll @@ -0,0 +1,7310 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4i64_v2i64__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v2i64__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> poison + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 
v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v2i64__2_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 
+; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4i64_v2i64__3_3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + 
%vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: 
v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], 
s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 
sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i64_v2i64__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] 
offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i64_v2i64__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> zeroinitializer + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v10, v0 +; GFX940-NEXT: v_mov_b32_e32 v11, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 
v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4i64_v2i64__3_3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; 
GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_1_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_0_1_1(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: v_mov_b32_e32 v10, v2 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] 
offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_0_1: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + 
+define void @v_shuffle_v4i64_v2i64__3_3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v2i64__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v2i64__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; 
GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; 
GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf 
= shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: 
v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: 
v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i64_v2i64__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__0_3_3_3: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; 
GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4i64_v2i64__3_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_0_3_3: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v2i64__3_3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v2i64__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v2i64__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v2i64__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=v"() + %vec1 = call <2 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v4i64_v2i64__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__u_u_u_u: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__1_u_u_u() { +; GFX900-LABEL: 
s_shuffle_v4i64_v2i64__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__2_u_u_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_0_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_1_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_2_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_0_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_1_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; 
GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 poison> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_2_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector
<2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 2, i32 poison> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_3_u() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 poison> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_3_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +;
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 0> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void
@s_shuffle_v4i64_v2i64__3_3_3_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 1> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_3_2() { +; GFX900-LABEL:
s_shuffle_v4i64_v2i64__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 2> + call void asm sideeffect "; use
$0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__u_0_0_0() { +;
GFX900-LABEL: s_shuffle_v4i64_v2i64__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT:
s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11]
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6,
s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 0, i32 0, i32 0> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_u_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT:
s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 poison, i32 0, i32 0> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_1_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +;
GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 1, i32 0, i32 0> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_2_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT:
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 2, i32 0, i32 0> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_0: +;
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 0, i32 0> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_u_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 poison, i32 0> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_1_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> <i32 3, i32 3, i32 1, i32 0> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_2_0() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use
s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use 
s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; 
GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i64_v2i64__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 
+; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; 
GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_u_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 
s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_0_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_2_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_u_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_0_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm 
"; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_2_1() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, 
s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_u_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_u_2_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_0_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_0_2_2: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_1_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_1_2_2: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_u_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_0_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_1_2() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; 
GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__0_3_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_u_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_0_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; 
GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_1_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: 
s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_2_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_u_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v4i64_v2i64__3_3_0_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_1_3() { +; GFX900-LABEL: 
s_shuffle_v4i64_v2i64__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v2i64__3_3_2_3() { +; GFX900-LABEL: s_shuffle_v4i64_v2i64__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v2i64__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v2i64__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x i64> asm "; def $0", "=s"() + %vec1 = call <2 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll new file mode 100644 index 0000000000000..5e0950729881b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll @@ -0,0 +1,16014 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4i64_v3i64__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v3i64__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> poison + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], 
s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v3i64__3_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i64_v3i64__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i64_v3i64__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: 
global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; 
GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_2_u: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; 
GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 
v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> 
%vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; 
GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 
+; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] 
offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> zeroinitializer + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 
+; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 
v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; 
GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_1_0_0: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; 
GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i64_v3i64__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr 
addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 
+; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; 
GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, 
v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; 
GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_u_1_1: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] 
offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + 
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; 
GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: 
global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_u_2_2(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i64_v3i64__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", 
"=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, 
v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, 
v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x 
i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: 
v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v3i64__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] 
offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v3i64__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 
+; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_1_3_3(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 
v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 
+; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, 
v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: 
v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: 
v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__0_4_4_4: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 
v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 
v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; 
GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: 
global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; 
GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 
v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 
v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_u_4(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4i64_v3i64__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; 
GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; 
GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x 
i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 
x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 
x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], 
s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: 
v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret 
void +} + +define void @v_shuffle_v4i64_v3i64__5_5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4i64_v3i64__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; 
GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v3i64__5_5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=v"() + %vec1 = call <3 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v4i64_v3i64__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + 
+define void @s_shuffle_v4i64_v3i64__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 
s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v4i64_v3i64__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_0_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] 
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[12:17]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s16
+; GFX900-NEXT: s_mov_b32 s5, s17
+; GFX900-NEXT: s_mov_b32 s6, s8
+; GFX900-NEXT: s_mov_b32 s7, s9
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[12:17]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s16
+; GFX90A-NEXT: s_mov_b32 s5, s17
+; GFX90A-NEXT: s_mov_b32 s6, s8
+; GFX90A-NEXT: s_mov_b32 s7, s9
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_0_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s2, s4
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[8:13]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s12
+; GFX940-NEXT: s_mov_b32 s1, s13
+; GFX940-NEXT: s_mov_b32 s3, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 0, i32 poison, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 1, poison, poison>: lane 0 = %vec1[2], lane 1 = %vec0[1]; lanes 2-3 are poison.
+define void @s_shuffle_v4i64_v3i64__5_1_u_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s12
+; GFX900-NEXT: s_mov_b32 s5, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s12
+; GFX90A-NEXT: s_mov_b32 s5, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_1_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s8
+; GFX940-NEXT: s_mov_b32 s1, s9
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 1, i32 poison, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 2, poison, poison>: lane 0 = %vec1[2], lane 1 = %vec0[2]; lanes 2-3 are poison.
+define void @s_shuffle_v4i64_v3i64__5_2_u_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[12:17]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s16
+; GFX900-NEXT: s_mov_b32 s5, s17
+; GFX900-NEXT: s_mov_b32 s6, s8
+; GFX900-NEXT: s_mov_b32 s7, s9
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[12:17]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s16
+; GFX90A-NEXT: s_mov_b32 s5, s17
+; GFX90A-NEXT: s_mov_b32 s6, s8
+; GFX90A-NEXT: s_mov_b32 s7, s9
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_2_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[8:13]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s12
+; GFX940-NEXT: s_mov_b32 s1, s13
+; GFX940-NEXT: s_mov_b32 s2, s4
+; GFX940-NEXT: s_mov_b32 s3, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 2, i32 poison, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 3, poison, poison>: lane 0 = %vec1[2], lane 1 = %vec1[0]; lanes 2-3 are poison.
+define void @s_shuffle_v4i64_v3i64__5_3_u_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s12
+; GFX900-NEXT: s_mov_b32 s5, s13
+; GFX900-NEXT: s_mov_b32 s6, s8
+; GFX900-NEXT: s_mov_b32 s7, s9
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s12
+; GFX90A-NEXT: s_mov_b32 s5, s13
+; GFX90A-NEXT: s_mov_b32 s6, s8
+; GFX90A-NEXT: s_mov_b32 s7, s9
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_3_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s8
+; GFX940-NEXT: s_mov_b32 s1, s9
+; GFX940-NEXT: s_mov_b32 s2, s4
+; GFX940-NEXT: s_mov_b32 s3, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 3, i32 poison, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 4, poison, poison>: lane 0 = %vec1[2], lane 1 = %vec1[1]; lanes 2-3 are poison.
+define void @s_shuffle_v4i64_v3i64__5_4_u_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s8
+; GFX900-NEXT: s_mov_b32 s5, s9
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s8
+; GFX90A-NEXT: s_mov_b32 s5, s9
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_4_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s4
+; GFX940-NEXT: s_mov_b32 s1, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 4, i32 poison, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, poison, poison>: lanes 0-1 = %vec1[2]; lanes 2-3 are poison.
+define void @s_shuffle_v4i64_v3i64__5_5_u_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s8
+; GFX900-NEXT: s_mov_b32 s5, s9
+; GFX900-NEXT: s_mov_b32 s6, s8
+; GFX900-NEXT: s_mov_b32 s7, s9
+; GFX900-NEXT: s_mov_b32 s10, s8
+; GFX900-NEXT: s_mov_b32 s11, s9
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s8
+; GFX90A-NEXT: s_mov_b32 s5, s9
+; GFX90A-NEXT: s_mov_b32 s6, s8
+; GFX90A-NEXT: s_mov_b32 s7, s9
+; GFX90A-NEXT: s_mov_b32 s10, s8
+; GFX90A-NEXT: s_mov_b32 s11, s9
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s4
+; GFX940-NEXT: s_mov_b32 s1, s5
+; GFX940-NEXT: s_mov_b32 s2, s4
+; GFX940-NEXT: s_mov_b32 s3, s5
+; GFX940-NEXT: s_mov_b32 s6, s4
+; GFX940-NEXT: s_mov_b32 s7, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 poison, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 0, poison>: lanes 0-1 = %vec1[2], lane 2 = %vec0[0]; lane 3 is poison.
+define void @s_shuffle_v4i64_v3i64__5_5_0_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[12:17]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s16
+; GFX900-NEXT: s_mov_b32 s5, s17
+; GFX900-NEXT: s_mov_b32 s6, s16
+; GFX900-NEXT: s_mov_b32 s7, s17
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[12:17]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s16
+; GFX90A-NEXT: s_mov_b32 s5, s17
+; GFX90A-NEXT: s_mov_b32 s6, s16
+; GFX90A-NEXT: s_mov_b32 s7, s17
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_0_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[8:13]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s12
+; GFX940-NEXT: s_mov_b32 s1, s13
+; GFX940-NEXT: s_mov_b32 s2, s12
+; GFX940-NEXT: s_mov_b32 s3, s13
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 0, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 1, poison>: lanes 0-1 = %vec1[2], lane 2 = %vec0[1]; lane 3 is poison.
+define void @s_shuffle_v4i64_v3i64__5_5_1_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s8
+; GFX900-NEXT: s_mov_b32 s5, s9
+; GFX900-NEXT: s_mov_b32 s6, s8
+; GFX900-NEXT: s_mov_b32 s7, s9
+; GFX900-NEXT: s_mov_b32 s8, s10
+; GFX900-NEXT: s_mov_b32 s9, s11
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s8
+; GFX90A-NEXT: s_mov_b32 s5, s9
+; GFX90A-NEXT: s_mov_b32 s6, s8
+; GFX90A-NEXT: s_mov_b32 s7, s9
+; GFX90A-NEXT: s_mov_b32 s8, s10
+; GFX90A-NEXT: s_mov_b32 s9, s11
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_1_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s4
+; GFX940-NEXT: s_mov_b32 s1, s5
+; GFX940-NEXT: s_mov_b32 s2, s4
+; GFX940-NEXT: s_mov_b32 s3, s5
+; GFX940-NEXT: s_mov_b32 s4, s6
+; GFX940-NEXT: s_mov_b32 s5, s7
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 1, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 2, poison>: lanes 0-1 = %vec1[2], lane 2 = %vec0[2]; lane 3 is poison.
+define void @s_shuffle_v4i64_v3i64__5_5_2_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[12:17]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s16
+; GFX900-NEXT: s_mov_b32 s5, s17
+; GFX900-NEXT: s_mov_b32 s6, s16
+; GFX900-NEXT: s_mov_b32 s7, s17
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[12:17]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s16
+; GFX90A-NEXT: s_mov_b32 s5, s17
+; GFX90A-NEXT: s_mov_b32 s6, s16
+; GFX90A-NEXT: s_mov_b32 s7, s17
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_2_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[8:13]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s12
+; GFX940-NEXT: s_mov_b32 s1, s13
+; GFX940-NEXT: s_mov_b32 s2, s12
+; GFX940-NEXT: s_mov_b32 s3, s13
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 2, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 3, poison>: lanes 0-1 = %vec1[2], lane 2 = %vec1[0]; lane 3 is poison.
+define void @s_shuffle_v4i64_v3i64__5_5_3_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s12
+; GFX900-NEXT: s_mov_b32 s5, s13
+; GFX900-NEXT: s_mov_b32 s6, s12
+; GFX900-NEXT: s_mov_b32 s7, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s12
+; GFX90A-NEXT: s_mov_b32 s5, s13
+; GFX90A-NEXT: s_mov_b32 s6, s12
+; GFX90A-NEXT: s_mov_b32 s7, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_3_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s8
+; GFX940-NEXT: s_mov_b32 s1, s9
+; GFX940-NEXT: s_mov_b32 s2, s8
+; GFX940-NEXT: s_mov_b32 s3, s9
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 3, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 4, poison>: lanes 0-1 = %vec1[2], lane 2 = %vec1[1]; lane 3 is poison.
+define void @s_shuffle_v4i64_v3i64__5_5_4_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s12
+; GFX900-NEXT: s_mov_b32 s5, s13
+; GFX900-NEXT: s_mov_b32 s6, s12
+; GFX900-NEXT: s_mov_b32 s7, s13
+; GFX900-NEXT: s_mov_b32 s8, s10
+; GFX900-NEXT: s_mov_b32 s9, s11
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s12
+; GFX90A-NEXT: s_mov_b32 s5, s13
+; GFX90A-NEXT: s_mov_b32 s6, s12
+; GFX90A-NEXT: s_mov_b32 s7, s13
+; GFX90A-NEXT: s_mov_b32 s8, s10
+; GFX90A-NEXT: s_mov_b32 s9, s11
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_4_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s8
+; GFX940-NEXT: s_mov_b32 s1, s9
+; GFX940-NEXT: s_mov_b32 s2, s8
+; GFX940-NEXT: s_mov_b32 s3, s9
+; GFX940-NEXT: s_mov_b32 s4, s6
+; GFX940-NEXT: s_mov_b32 s5, s7
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 4, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 5, poison>: lanes 0-2 = %vec1[2]; lane 3 is poison.
+define void @s_shuffle_v4i64_v3i64__5_5_5_u() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_u:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s8
+; GFX900-NEXT: s_mov_b32 s5, s9
+; GFX900-NEXT: s_mov_b32 s6, s8
+; GFX900-NEXT: s_mov_b32 s7, s9
+; GFX900-NEXT: s_mov_b32 s10, s8
+; GFX900-NEXT: s_mov_b32 s11, s9
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_u:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s8
+; GFX90A-NEXT: s_mov_b32 s5, s9
+; GFX90A-NEXT: s_mov_b32 s6, s8
+; GFX90A-NEXT: s_mov_b32 s7, s9
+; GFX90A-NEXT: s_mov_b32 s10, s8
+; GFX90A-NEXT: s_mov_b32 s11, s9
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_5_u:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s4
+; GFX940-NEXT: s_mov_b32 s1, s5
+; GFX940-NEXT: s_mov_b32 s2, s4
+; GFX940-NEXT: s_mov_b32 s3, s5
+; GFX940-NEXT: s_mov_b32 s6, s4
+; GFX940-NEXT: s_mov_b32 s7, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 poison>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 5, 0>: lanes 0-2 = %vec1[2], lane 3 = %vec0[0].
+define void @s_shuffle_v4i64_v3i64__5_5_5_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[12:17]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s8
+; GFX900-NEXT: s_mov_b32 s5, s9
+; GFX900-NEXT: s_mov_b32 s6, s8
+; GFX900-NEXT: s_mov_b32 s7, s9
+; GFX900-NEXT: s_mov_b32 s10, s12
+; GFX900-NEXT: s_mov_b32 s11, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[12:17]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s8
+; GFX90A-NEXT: s_mov_b32 s5, s9
+; GFX90A-NEXT: s_mov_b32 s6, s8
+; GFX90A-NEXT: s_mov_b32 s7, s9
+; GFX90A-NEXT: s_mov_b32 s10, s12
+; GFX90A-NEXT: s_mov_b32 s11, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_5_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[8:13]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s4
+; GFX940-NEXT: s_mov_b32 s1, s5
+; GFX940-NEXT: s_mov_b32 s2, s4
+; GFX940-NEXT: s_mov_b32 s3, s5
+; GFX940-NEXT: s_mov_b32 s6, s8
+; GFX940-NEXT: s_mov_b32 s7, s9
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 0>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 5, 1>: lanes 0-2 = %vec1[2], lane 3 = %vec0[1].
+define void @s_shuffle_v4i64_v3i64__5_5_5_1() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s8
+; GFX900-NEXT: s_mov_b32 s5, s9
+; GFX900-NEXT: s_mov_b32 s6, s8
+; GFX900-NEXT: s_mov_b32 s7, s9
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_1:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s8
+; GFX90A-NEXT: s_mov_b32 s5, s9
+; GFX90A-NEXT: s_mov_b32 s6, s8
+; GFX90A-NEXT: s_mov_b32 s7, s9
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_5_1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s4
+; GFX940-NEXT: s_mov_b32 s1, s5
+; GFX940-NEXT: s_mov_b32 s2, s4
+; GFX940-NEXT: s_mov_b32 s3, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 1>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 5, 2>: lanes 0-2 = %vec1[2], lane 3 = %vec0[2].
+define void @s_shuffle_v4i64_v3i64__5_5_5_2() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_2:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s8
+; GFX900-NEXT: s_mov_b32 s5, s9
+; GFX900-NEXT: s_mov_b32 s6, s8
+; GFX900-NEXT: s_mov_b32 s7, s9
+; GFX900-NEXT: s_mov_b32 s10, s12
+; GFX900-NEXT: s_mov_b32 s11, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_2:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s8
+; GFX90A-NEXT: s_mov_b32 s5, s9
+; GFX90A-NEXT: s_mov_b32 s6, s8
+; GFX90A-NEXT: s_mov_b32 s7, s9
+; GFX90A-NEXT: s_mov_b32 s10, s12
+; GFX90A-NEXT: s_mov_b32 s11, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_5_2:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s6, s8
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s4
+; GFX940-NEXT: s_mov_b32 s1, s5
+; GFX940-NEXT: s_mov_b32 s2, s4
+; GFX940-NEXT: s_mov_b32 s3, s5
+; GFX940-NEXT: s_mov_b32 s7, s9
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 2>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 5, 3>: lanes 0-2 = %vec1[2], lane 3 = %vec1[0].
+define void @s_shuffle_v4i64_v3i64__5_5_5_3() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[12:17]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s16
+; GFX900-NEXT: s_mov_b32 s5, s17
+; GFX900-NEXT: s_mov_b32 s6, s16
+; GFX900-NEXT: s_mov_b32 s7, s17
+; GFX900-NEXT: s_mov_b32 s8, s16
+; GFX900-NEXT: s_mov_b32 s9, s17
+; GFX900-NEXT: s_mov_b32 s10, s12
+; GFX900-NEXT: s_mov_b32 s11, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[12:17]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s16
+; GFX90A-NEXT: s_mov_b32 s5, s17
+; GFX90A-NEXT: s_mov_b32 s6, s16
+; GFX90A-NEXT: s_mov_b32 s7, s17
+; GFX90A-NEXT: s_mov_b32 s8, s16
+; GFX90A-NEXT: s_mov_b32 s9, s17
+; GFX90A-NEXT: s_mov_b32 s10, s12
+; GFX90A-NEXT: s_mov_b32 s11, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_5_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[8:13]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s12
+; GFX940-NEXT: s_mov_b32 s1, s13
+; GFX940-NEXT: s_mov_b32 s2, s12
+; GFX940-NEXT: s_mov_b32 s3, s13
+; GFX940-NEXT: s_mov_b32 s4, s12
+; GFX940-NEXT: s_mov_b32 s5, s13
+; GFX940-NEXT: s_mov_b32 s6, s8
+; GFX940-NEXT: s_mov_b32 s7, s9
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 3>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 5, 4>: lanes 0-2 = %vec1[2], lane 3 = %vec1[1].
+define void @s_shuffle_v4i64_v3i64__5_5_5_4() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_4:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:13]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s12
+; GFX900-NEXT: s_mov_b32 s5, s13
+; GFX900-NEXT: s_mov_b32 s6, s12
+; GFX900-NEXT: s_mov_b32 s7, s13
+; GFX900-NEXT: s_mov_b32 s8, s12
+; GFX900-NEXT: s_mov_b32 s9, s13
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_4:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:13]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s12
+; GFX90A-NEXT: s_mov_b32 s5, s13
+; GFX90A-NEXT: s_mov_b32 s6, s12
+; GFX90A-NEXT: s_mov_b32 s7, s13
+; GFX90A-NEXT: s_mov_b32 s8, s12
+; GFX90A-NEXT: s_mov_b32 s9, s13
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_5_4:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:9]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s8
+; GFX940-NEXT: s_mov_b32 s1, s9
+; GFX940-NEXT: s_mov_b32 s2, s8
+; GFX940-NEXT: s_mov_b32 s3, s9
+; GFX940-NEXT: s_mov_b32 s4, s8
+; GFX940-NEXT: s_mov_b32 s5, s9
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 4>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <5, 5, 5, 5>: every lane = %vec1[2].
+define void @s_shuffle_v4i64_v3i64__5_5_5_5() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_5_5:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s8
+; GFX900-NEXT: s_mov_b32 s5, s9
+; GFX900-NEXT: s_mov_b32 s6, s8
+; GFX900-NEXT: s_mov_b32 s7, s9
+; GFX900-NEXT: s_mov_b32 s10, s8
+; GFX900-NEXT: s_mov_b32 s11, s9
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_5_5:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s8
+; GFX90A-NEXT: s_mov_b32 s5, s9
+; GFX90A-NEXT: s_mov_b32 s6, s8
+; GFX90A-NEXT: s_mov_b32 s7, s9
+; GFX90A-NEXT: s_mov_b32 s10, s8
+; GFX90A-NEXT: s_mov_b32 s11, s9
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_5_5:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s4
+; GFX940-NEXT: s_mov_b32 s1, s5
+; GFX940-NEXT: s_mov_b32 s2, s4
+; GFX940-NEXT: s_mov_b32 s3, s5
+; GFX940-NEXT: s_mov_b32 s6, s4
+; GFX940-NEXT: s_mov_b32 s7, s5
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %vec1 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; Mask <poison, 0, 0, 0> with a poison second operand: lane 0 is poison, lanes 1-3 = %vec0[0].
+define void @s_shuffle_v4i64_v3i64__u_0_0_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s6, s4
+; GFX900-NEXT: s_mov_b32 s7, s5
+; GFX900-NEXT: s_mov_b32 s8, s4
+; GFX900-NEXT: s_mov_b32 s9, s5
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s11, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s6, s4
+; GFX90A-NEXT: s_mov_b32 s7, s5
+; GFX90A-NEXT: s_mov_b32 s8, s4
+; GFX90A-NEXT: s_mov_b32 s9, s5
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s11, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s2, s0
+; GFX940-NEXT: s_mov_b32 s3, s1
+; GFX940-NEXT: s_mov_b32 s4, s0
+; GFX940-NEXT: s_mov_b32 s5, s1
+; GFX940-NEXT: s_mov_b32 s6, s0
+; GFX940-NEXT: s_mov_b32 s7, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf)
+  ret void
+}
+
+; zeroinitializer mask: every lane = %vec0[0].
+define void @s_shuffle_v4i64_v3i64__0_0_0_0() {
+; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_0_0_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:9]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s6, s4
+; GFX900-NEXT: s_mov_b32 s7, s5
+; GFX900-NEXT: s_mov_b32 s8, s4
+; GFX900-NEXT: s_mov_b32 s9, s5
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s11, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_0_0_0:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:9]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s6, s4
+; GFX90A-NEXT: s_mov_b32 s7, s5
+; GFX90A-NEXT: s_mov_b32 s8, s4
+; GFX90A-NEXT: s_mov_b32 s9, s5
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s11, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4i64_v3i64__0_0_0_0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:5]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s2, s0
+; GFX940-NEXT: s_mov_b32 s3, s1
+; GFX940-NEXT: s_mov_b32 s4, s0
+; GFX940-NEXT: s_mov_b32 s5, s1
+; GFX940-NEXT: s_mov_b32 s6, s0
+; GFX940-NEXT: s_mov_b32 s7, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <3 x i64> asm "; def $0", "=s"()
+  %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> zeroinitializer
+  call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64>
%shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> 
+ call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + 
%shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + 
+define void @s_shuffle_v4i64_v3i64__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_u_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_1_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, 
s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_2_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i64_v3i64__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_3_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: 
s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v4i64_v3i64__5_4_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = 
shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; 
GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_u_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_1_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_2_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s12 +; GFX940-NEXT: s_mov_b32 s5, s13 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_3_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: s_mov_b32 s4, s12 +; GFX940-NEXT: s_mov_b32 s5, s13 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_4_0() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x 
i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__0_1_1_1() { +; 
GFX900-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: 
s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i64_v3i64__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_u_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; 
GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_0_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_2_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v4i64_v3i64__5_3_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = 
shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_4_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 
x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_u_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_0_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x 
i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_2_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; 
GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_3_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_4_1() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> 
poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_2_2_2() { +; 
GFX900-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_2_2_2() { +; 
GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> 
%vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_u_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; 
use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_0_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: 
s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_1_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_3_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; 
GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_4_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i64_v3i64__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_u_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: 
s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_0_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_1_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_3_2() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: 
s_mov_b32 s7, s21 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: s_mov_b32 s4, s12 +; GFX940-NEXT: s_mov_b32 s5, s13 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_4_2() { +; GFX900-LABEL: 
s_shuffle_v4i64_v3i64__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 
+; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_u_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_0_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_1_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_2_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_3_3: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_4_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 
+; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: 
s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_u_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: 
s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_0_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_1_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_2_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 
s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_4_3() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 
+; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_4_4_4: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: 
s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 
s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_u_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_0_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 
+; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_1_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_2_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_3_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i64_v3i64__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_u_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_0_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; 
GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_1_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_4: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_2_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; 
GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_3_4() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i64_v3i64__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; 
GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_u_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_0_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i64_v3i64__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_1_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 
s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_2_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_3_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_4_5_5() { +; GFX900-LABEL: 
s_shuffle_v4i64_v3i64__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_u_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: 
s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_0_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; 
GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_1_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_2_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_3_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = 
shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__5_5_4_5() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <3 x i64> asm "; def $0", "=s"() + %vec1 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll new file mode 100644 index 0000000000000..1c0388740a831 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll @@ -0,0 +1,27249 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4i64_v4i64__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v4i64__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> poison + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__1_u_u_u: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4i64_v4i64__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v4i64__4_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4i64_v4i64__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__6_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + 
%vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_u: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: 
v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + 
+define void @v_shuffle_v4i64_v4i64__7_7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 
sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call 
<4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x 
i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 
v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_7_6: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] 
offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> zeroinitializer + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: 
v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, 
v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__6_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; 
GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4i64_v4i64__7_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: 
v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_0_0: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; 
GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, 
v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = 
call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 
+; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4i64_v4i64__7_7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; 
GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: 
global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i64_v4i64__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__6_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v10, v2 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, 
v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v12 +; GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v14, 
v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 
+; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i64_v4i64__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_2_1(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 
v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 
v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4i64_v4i64__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__1_2_2_2: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] 
offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> <i32 4, i32 2, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; 
GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 2, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__6_2_2_2(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4i64_v4i64__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v12, v4 +; GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 2, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v12 +; GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: 
global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i64_v4i64__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v12 +; GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm 
"; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; 
GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 2, i32 2> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v12 +; GFX940-NEXT: v_mov_b32_e32 v9, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 
x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; 
GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 
v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i64_v4i64__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret 
void +} + +define void @v_shuffle_v4i64_v4i64__7_7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] 
offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_6_2: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__1_3_3_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32>
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__5_3_3_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__5_3_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: v_mov_b32_e32 v12, v6
+; GFX900-NEXT: v_mov_b32_e32 v13, v7
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__5_3_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v12, v6
+; GFX90A-NEXT: v_mov_b32_e32 v13, v7
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__5_3_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v12, v6
+; GFX940-NEXT: v_mov_b32_e32 v13, v7
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 5, i32 3, i32 3, i32 3> ; %vec1[1], %vec0[3], %vec0[3], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__6_3_3_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__6_3_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: v_mov_b32_e32 v14, v6
+; GFX900-NEXT: v_mov_b32_e32 v15, v7
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__6_3_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v14, v6
+; GFX90A-NEXT: v_mov_b32_e32 v15, v7
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__6_3_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v14, v6
+; GFX940-NEXT: v_mov_b32_e32 v15, v7
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 6, i32 3, i32 3, i32 3> ; %vec1[2], %vec0[3], %vec0[3], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_3_3_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_3_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: v_mov_b32_e32 v4, v14
+; GFX900-NEXT: v_mov_b32_e32 v5, v15
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_3_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v14
+; GFX90A-NEXT: v_mov_b32_e32 v5, v15
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_3_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v4, v14
+; GFX940-NEXT: v_mov_b32_e32 v5, v15
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 3, i32 3, i32 3> ; %vec1[3], %vec0[3], %vec0[3], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_u_3_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: v_mov_b32_e32 v0, v14
+; GFX900-NEXT: v_mov_b32_e32 v1, v15
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_u_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v0, v14
+; GFX90A-NEXT: v_mov_b32_e32 v1, v15
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_u_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v0, v14
+; GFX940-NEXT: v_mov_b32_e32 v1, v15
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 poison, i32 3, i32 3> ; %vec1[3], poison, %vec0[3], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_0_3_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_0_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: v_mov_b32_e32 v2, v14
+; GFX900-NEXT: v_mov_b32_e32 v3, v15
+; GFX900-NEXT: v_mov_b32_e32 v4, v0
+; GFX900-NEXT: v_mov_b32_e32 v5, v1
+; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_0_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: v_mov_b32_e32 v2, v14
+; GFX90A-NEXT: v_mov_b32_e32 v3, v15
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_0_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: v_mov_b32_e32 v2, v14
+; GFX940-NEXT: v_mov_b32_e32 v3, v15
+; GFX940-NEXT: v_mov_b32_e32 v4, v0
+; GFX940-NEXT: v_mov_b32_e32 v5, v1
+; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 0, i32 3, i32 3> ; %vec1[3], %vec0[0], %vec0[3], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_1_3_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_1_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: v_mov_b32_e32 v0, v14
+; GFX900-NEXT: v_mov_b32_e32 v1, v15
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v0, v14
+; GFX90A-NEXT: v_mov_b32_e32 v1, v15
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_1_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v0, v14
+; GFX940-NEXT: v_mov_b32_e32 v1, v15
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 1, i32 3, i32 3> ; %vec1[3], %vec0[1], %vec0[3], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_2_3_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_2_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: v_mov_b32_e32 v1, v7
+; GFX900-NEXT: v_mov_b32_e32 v2, v6
+; GFX900-NEXT: v_mov_b32_e32 v3, v7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16
+; GFX900-NEXT: s_nop 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v14
+; GFX900-NEXT: v_mov_b32_e32 v3, v15
+; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_2_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v7
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v14
+; GFX90A-NEXT: v_mov_b32_e32 v3, v15
+; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_2_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: v_mov_b32_e32 v2, v6
+; GFX940-NEXT: v_mov_b32_e32 v3, v7
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_mov_b32_e32 v2, v14
+; GFX940-NEXT: v_mov_b32_e32 v3, v15
+; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 2, i32 3, i32 3> ; %vec1[3], %vec0[2], %vec0[3], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_4_3_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: v_mov_b32_e32 v0, v14
+; GFX900-NEXT: v_mov_b32_e32 v1, v15
+; GFX900-NEXT: v_mov_b32_e32 v2, v8
+; GFX900-NEXT: v_mov_b32_e32 v3, v9
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v0, v14
+; GFX90A-NEXT: v_mov_b32_e32 v1, v15
+; GFX90A-NEXT: v_mov_b32_e32 v2, v8
+; GFX90A-NEXT: v_mov_b32_e32 v3, v9
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_4_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v0, v14
+; GFX940-NEXT: v_mov_b32_e32 v1, v15
+; GFX940-NEXT: v_mov_b32_e32 v2, v8
+; GFX940-NEXT: v_mov_b32_e32 v3, v9
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 4, i32 3, i32 3> ; %vec1[3], %vec1[0], %vec0[3], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_5_3_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: v_mov_b32_e32 v8, v14
+; GFX900-NEXT: v_mov_b32_e32 v9, v15
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v8, v14
+; GFX90A-NEXT: v_mov_b32_e32 v9, v15
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_5_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v8, v14
+; GFX940-NEXT: v_mov_b32_e32 v9, v15
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3> ; %vec1[3], %vec1[1], %vec0[3], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_6_3_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: v_mov_b32_e32 v10, v14
+; GFX900-NEXT: v_mov_b32_e32 v11, v15
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_6_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v10, v14
+; GFX90A-NEXT: v_mov_b32_e32 v11, v15
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_6_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v10, v14
+; GFX940-NEXT: v_mov_b32_e32 v11, v15
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 6, i32 3, i32 3> ; %vec1[3], %vec1[2], %vec0[3], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_7_3_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: v_mov_b32_e32 v12, v14
+; GFX900-NEXT: v_mov_b32_e32 v13, v15
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_3_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v12, v14
+; GFX90A-NEXT: v_mov_b32_e32 v13, v15
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_3_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v12, v14
+; GFX940-NEXT: v_mov_b32_e32 v13, v15
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 3> ; %vec1[3], %vec1[3], %vec0[3], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_7_u_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v12, v14
+; GFX900-NEXT: v_mov_b32_e32 v13, v15
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v14
+; GFX90A-NEXT: v_mov_b32_e32 v13, v15
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_u_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v12, v14
+; GFX940-NEXT: v_mov_b32_e32 v13, v15
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 3> ; %vec1[3], %vec1[3], poison, %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_7_0_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_0_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v2, v6
+; GFX900-NEXT: v_mov_b32_e32 v3, v7
+; GFX900-NEXT: v_mov_b32_e32 v12, v14
+; GFX900-NEXT: v_mov_b32_e32 v13, v15
+; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_0_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v6
+; GFX90A-NEXT: v_mov_b32_e32 v3, v7
+; GFX90A-NEXT: v_mov_b32_e32 v12, v14
+; GFX90A-NEXT: v_mov_b32_e32 v13, v15
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_0_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v2, v6
+; GFX940-NEXT: v_mov_b32_e32 v3, v7
+; GFX940-NEXT: v_mov_b32_e32 v12, v14
+; GFX940-NEXT: v_mov_b32_e32 v13, v15
+; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 3> ; %vec1[3], %vec1[3], %vec0[0], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_7_1_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_1_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v4, v6
+; GFX900-NEXT: v_mov_b32_e32 v5, v7
+; GFX900-NEXT: v_mov_b32_e32 v12, v14
+; GFX900-NEXT: v_mov_b32_e32 v13, v15
+; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_1_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7
+; GFX90A-NEXT: v_mov_b32_e32 v12, v14
+; GFX90A-NEXT: v_mov_b32_e32 v13, v15
+; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_1_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v4, v6
+; GFX940-NEXT: v_mov_b32_e32 v5, v7
+; GFX940-NEXT: v_mov_b32_e32 v12, v14
+; GFX940-NEXT: v_mov_b32_e32 v13, v15
+; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 3> ; %vec1[3], %vec1[3], %vec0[1], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_7_2_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_2_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: v_mov_b32_e32 v12, v14
+; GFX900-NEXT: v_mov_b32_e32 v13, v15
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v14
+; GFX90A-NEXT: v_mov_b32_e32 v13, v15
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_2_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[8:15]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: v_mov_b32_e32 v16, 0
+; GFX940-NEXT: v_mov_b32_e32 v12, v14
+; GFX940-NEXT: v_mov_b32_e32 v13, v15
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[0:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x i64> asm "; def $0", "=v"()
+  %vec1 = call <4 x i64> asm "; def $0", "=v"()
+  %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 3> ; %vec1[3], %vec1[3], %vec0[2], %vec0[3]
+  store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32
+  ret void
+}
+
+define void @v_shuffle_v4i64_v4i64__7_7_4_3(ptr addrspace(1) inreg %ptr) {
+; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_4_3:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[8:15]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v16, 0
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def v[0:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: v_mov_b32_e32 v10, v6
+; GFX900-NEXT: v_mov_b32_e32 v11, v7
+; GFX900-NEXT: v_mov_b32_e32 v12, v14
+; GFX900-NEXT: v_mov_b32_e32 v13, v15
+; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
+; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_4_3:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[8:15]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v[0:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_mov_b32_e32 v10, v6
+; GFX90A-NEXT: v_mov_b32_e32 v11, v7
+; GFX90A-NEXT: v_mov_b32_e32 v12, v14
+; GFX90A-NEXT: v_mov_b32_e32 v13, v15
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_4_3:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; 
GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v12 +; GFX900-NEXT: v_mov_b32_e32 v5, v13 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, 
v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v12 +; GFX90A-NEXT: v_mov_b32_e32 v5, v13 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v12 +; GFX940-NEXT: v_mov_b32_e32 v5, v13 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v4i64__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4i64_v4i64__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4i64_v4i64__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__6_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i64_v4i64__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: 
v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 
+ ret void +} + +define void @v_shuffle_v4i64_v4i64__7_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 
v12, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_4_4: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v10, v0 +; GFX940-NEXT: v_mov_b32_e32 v11, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] 
offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_0_4(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + 
store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, 
v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4i64_v4i64__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 
+; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_4: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, 
v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; 
GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, v10 +; GFX940-NEXT: v_mov_b32_e32 v13, v11 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; 
GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__6_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] 
offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 
v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x 
i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: v_mov_b32_e32 
v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_3_5_5: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v10 +; GFX940-NEXT: v_mov_b32_e32 v13, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: 
global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 
+; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 
v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 
v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_1_5(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: 
global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 
+; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__u_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; 
GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__0_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__1_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], 
s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__2_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v10 +; GFX940-NEXT: v_mov_b32_e32 v13, v11 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm 
"; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__3_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v14, v12 +; GFX940-NEXT: v_mov_b32_e32 v15, v13 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__4_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__5_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__6_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; 
GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_u_6_6: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 
+; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 
+; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x 
i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; 
GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; 
GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, 
v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: 
v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__u_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__0_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__1_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; 
GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__2_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4i64_v4i64__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__3_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__4_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__5_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__6_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] 
offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + 
+define void @v_shuffle_v4i64_v4i64__7_2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 
v14, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_6_7_7: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4i64_v4i64__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> 
%vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; 
GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4i64_v4i64__7_7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4i64_v4i64__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4i64_v4i64__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4i64_v4i64__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=v"() + %vec1 = call <4 x i64> asm "; def $0", "=v"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + store <4 x i64> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v4i64_v4i64__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + 
%shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 
x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__6_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_0_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_1_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_2_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_3_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i64_v4i64__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_4_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_5_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v4i64_v4i64__7_6_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 
s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_0_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 
s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_u: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_u() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; 
GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void 
asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 
x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, 
s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; 
GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, 
s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; 
GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 
x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 
x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x 
i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, 
s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__6_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_0_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; 
GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_u_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_1_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_2_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, 
<4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_3_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_4_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_5_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_5_0_0: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_6_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; 
GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_0_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_0: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 
s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 
x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: 
s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: 
s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_0() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i64_v4i64__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; 
GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: 
s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v4i64_v4i64__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__6_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_1_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: 
s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_u_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_0_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, 
s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_2_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_3_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_4_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; 
GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void 
@s_shuffle_v4i64_v4i64__7_5_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; 
def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_6_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: 
s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; 
GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_0_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; 
GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_1() { +; GFX900-LABEL: 
s_shuffle_v4i64_v4i64__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: 
s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_1() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i64_v4i64__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def 
$0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret 
void +} + +define void @s_shuffle_v4i64_v4i64__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; 
GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__6_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: 
s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_2_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_u_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_0_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_1_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, 
<4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_3_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_4_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, 
s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_5_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_6_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, 
s19 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_0_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; 
GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", 
"=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: s_mov_b32 s20, s12 +; GFX900-NEXT: s_mov_b32 s21, s13 +; GFX900-NEXT: s_mov_b32 s22, s8 +; GFX900-NEXT: s_mov_b32 s23, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: s_mov_b32 s20, s12 +; GFX90A-NEXT: s_mov_b32 s21, s13 +; GFX90A-NEXT: s_mov_b32 s22, s8 +; GFX90A-NEXT: s_mov_b32 s23, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: s_mov_b32 s16, s8 +; GFX940-NEXT: s_mov_b32 s17, s9 +; GFX940-NEXT: s_mov_b32 s18, s4 +; GFX940-NEXT: 
s_mov_b32 s19, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: s_mov_b32 s20, s14 +; GFX900-NEXT: s_mov_b32 s21, s15 +; GFX900-NEXT: s_mov_b32 s22, s8 +; GFX900-NEXT: s_mov_b32 s23, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: s_mov_b32 s20, s14 +; GFX90A-NEXT: s_mov_b32 s21, s15 +; GFX90A-NEXT: s_mov_b32 s22, s8 +; GFX90A-NEXT: s_mov_b32 s23, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: s_mov_b32 s16, s10 +; GFX940-NEXT: s_mov_b32 s17, s11 +; GFX940-NEXT: s_mov_b32 s18, s4 +; GFX940-NEXT: s_mov_b32 s19, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_2() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, 
s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__6_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_3_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; 
GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_u_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_0_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void 
asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_1_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x 
i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_2_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x 
i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_4_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_5_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 
s9, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_6_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i64_v4i64__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_0_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: 
s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: 
s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 
s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_3() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: 
s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__6_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_4_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; 
GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_u_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_0_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void 
asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_1_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 
x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_2_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; 
GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_3_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_5_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; 
GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_6_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, 
s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_0_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; 
GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: 
s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_4() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s8 +; 
GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 
s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i64_v4i64__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s16, s14 +; 
GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: s_mov_b32 s18, s14 +; GFX900-NEXT: s_mov_b32 s19, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: s_mov_b32 s18, s14 +; GFX90A-NEXT: s_mov_b32 s19, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: s_mov_b32 s14, s10 +; GFX940-NEXT: s_mov_b32 s15, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: s_mov_b32 s18, s14 +; GFX900-NEXT: s_mov_b32 s19, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: s_mov_b32 s18, s14 +; GFX90A-NEXT: s_mov_b32 s19, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: s_mov_b32 s14, s10 +; GFX940-NEXT: s_mov_b32 s15, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: 
s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__6_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_5_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i64_v4i64__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_u_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_0_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_1_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: 
s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_2_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; 
GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_3_5_5() { +; 
GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_4_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: 
s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_6_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; 
GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 
+; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_0_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 
s9, s11 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; 
GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_5() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: 
s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__0_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__1_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: 
s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s10, s8 +; GFX940-NEXT: s_mov_b32 s11, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__2_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s18, s16 +; GFX900-NEXT: s_mov_b32 s19, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s18, s16 +; GFX90A-NEXT: s_mov_b32 s19, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: s_mov_b32 s14, s12 +; GFX940-NEXT: s_mov_b32 s15, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s18, s16 +; GFX900-NEXT: s_mov_b32 s19, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s18, s16 +; GFX90A-NEXT: s_mov_b32 s19, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: s_mov_b32 s14, s12 +; GFX940-NEXT: s_mov_b32 s15, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__6_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 
s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_6_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_u_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_0_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s10, s8 +; GFX940-NEXT: s_mov_b32 s11, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_1_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_2_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s18, s16 +; GFX900-NEXT: s_mov_b32 s19, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; 
GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s18, s16 +; GFX90A-NEXT: s_mov_b32 s19, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s14, s12 +; GFX940-NEXT: s_mov_b32 s15, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_3_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s18, s16 +; GFX900-NEXT: s_mov_b32 s19, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s18, s16 +; GFX90A-NEXT: s_mov_b32 s19, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s14, s12 +; GFX940-NEXT: s_mov_b32 s15, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_4_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i64_v4i64__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_5_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4i64_v4i64__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; 
GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; 
GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_0_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: s_mov_b32 s18, s14 +; GFX900-NEXT: s_mov_b32 s19, s15 +; GFX900-NEXT: s_mov_b32 s20, s4 +; GFX900-NEXT: s_mov_b32 s21, s5 +; GFX900-NEXT: s_mov_b32 s22, s12 +; GFX900-NEXT: s_mov_b32 s23, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: s_mov_b32 s18, s14 +; GFX90A-NEXT: s_mov_b32 s19, s15 +; GFX90A-NEXT: s_mov_b32 s20, s4 +; GFX90A-NEXT: s_mov_b32 s21, s5 +; GFX90A-NEXT: s_mov_b32 s22, s12 +; GFX90A-NEXT: s_mov_b32 s23, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s16, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: s_mov_b32 s14, s10 +; GFX940-NEXT: s_mov_b32 s15, s11 +; GFX940-NEXT: s_mov_b32 s17, s1 +; GFX940-NEXT: s_mov_b32 s18, s8 +; GFX940-NEXT: s_mov_b32 s19, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: s_mov_b32 s18, s14 +; GFX900-NEXT: s_mov_b32 s19, s15 +; GFX900-NEXT: s_mov_b32 s20, s6 +; GFX900-NEXT: s_mov_b32 s21, s7 +; GFX900-NEXT: s_mov_b32 s22, s12 +; GFX900-NEXT: s_mov_b32 s23, s13 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: s_mov_b32 s18, s14 +; GFX90A-NEXT: s_mov_b32 s19, s15 +; GFX90A-NEXT: s_mov_b32 s20, s6 +; GFX90A-NEXT: s_mov_b32 s21, s7 +; GFX90A-NEXT: s_mov_b32 s22, s12 +; GFX90A-NEXT: s_mov_b32 s23, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s16, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: s_mov_b32 s14, s10 +; GFX940-NEXT: s_mov_b32 s15, s11 +; GFX940-NEXT: s_mov_b32 s17, s3 +; GFX940-NEXT: s_mov_b32 s18, s8 +; GFX940-NEXT: s_mov_b32 s19, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_6: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: 
s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_6() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 
+; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__u_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", 
"=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__0_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; 
GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__1_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__2_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__3_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__4_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__4_7_7_7: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__5_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__6_7_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 
+; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_u_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_0_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_1_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 
s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_2_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_3_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4i64_v4i64__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_4_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: 
s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_5_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4i64_v4i64__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_6_7_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_u_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_0_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 
+; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_1_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_2_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_2_7: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s14 +; GFX940-NEXT: s_mov_b32 s7, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_3_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_4_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, 
s11 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_5_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v4i64__7_7_6_7() { +; GFX900-LABEL: s_shuffle_v4i64_v4i64__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v4i64__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v4i64__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x i64> asm "; def $0", "=s"() + %vec1 = call <4 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <4 x i64> %vec0, <4 x i64> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x i64> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll new file mode 100644 index 0000000000000..2da63cfc82e22 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll @@ -0,0 +1,7310 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4p0_v2p0__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v2p0__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> poison + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4p0_v2p0__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v2p0__2_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 
v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr 
addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p0_v2p0__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def 
$0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> 
+ store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_3_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], 
s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> zeroinitializer + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] 
offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; 
GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p0_v2p0__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v10, v0 +; GFX940-NEXT: v_mov_b32_e32 v11, v1 +; GFX940-NEXT: v_mov_b32_e32 
v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: 
global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, 
align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; 
GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, 
v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: 
v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: v_mov_b32_e32 v10, v2 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 
v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; 
GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v2p0__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v2p0__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector 
<2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: 
v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_u_2: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4p0_v2p0__3_3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = 
call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] 
offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4p0_v2p0__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} 
+ +define void @v_shuffle_v4p0_v2p0__3_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, 
v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: 
global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p0_v2p0__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v2p0__3_3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v2p0__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v2p0__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v2p0__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=v"() + %vec1 = call <2 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v4p0_v2p0__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__0_u_u_u: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_0_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; 
GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_1_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_2_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; 
GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_0_u() { +; GFX900-LABEL: 
s_shuffle_v4p0_v2p0__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_1_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_2_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_3_u() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_3_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; 
GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_3_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_3_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 
s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 
+; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: 
s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret 
void +} + +define void @s_shuffle_v4p0_v2p0__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_u_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_1_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def 
$0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_2_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_u_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_1_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; 
GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_2_0() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: 
s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p0_v2p0__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_u_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_0_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_2_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; 
GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_u_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: 
s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_0_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_2_1() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__1_2_2_2: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_2_2: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_u_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_0_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; 
GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_1_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, 
s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; 
GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_u_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_0_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_1_2() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; 
GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__u_3_3_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, 
s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 
s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; 
GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_u_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_0_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_1_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_2_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; 
GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_u_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p0_v2p0__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_0_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_1_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v2p0__3_3_2_3() { +; GFX900-LABEL: s_shuffle_v4p0_v2p0__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v2p0__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v2p0__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: 
s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr> asm "; def $0", "=s"() + %vec1 = call <2 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr> %vec0, <2 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll new file mode 100644 index 0000000000000..962da96c308d1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll @@ -0,0 +1,16014 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4p0_v3p0__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v3p0__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> poison + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v3p0__3_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4p0_v3p0__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 
+; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 
x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 
+; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__u_0_0_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> zeroinitializer + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4p0_v3p0__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4p0_v3p0__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] 
offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = 
shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 
+; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: 
v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4p0_v3p0__5_5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: 
v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 
+; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p0_v3p0__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: 
global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: 
v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm 
"; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 
v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] 
offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: 
v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 
v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: v_shuffle_v4p0_v3p0__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_u_2_2: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], 
s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x 
ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: 
global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, 
v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] 
offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_4_2(ptr addrspace(1) inreg %ptr) { 
+; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v3p0__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[4:7], s[0:1] offset:16 sc0 sc1 +; 
GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", 
"=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4p0_v3p0__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v3p0__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: 
v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p0_v3p0__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; 
GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_0_3_3: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf 
= shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 
v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; 
GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4p0_v3p0__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: 
v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, 
align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 
+; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__3_4_4_4: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_u_4_4: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x 
ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: 
v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, 
v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; 
GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 
v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_1_4: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p0_v3p0__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 
v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, 
v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: 
v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 
+; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: 
global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = 
shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 
v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: 
v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v3p0__5_5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, 
v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=v"() + %vec1 = call <3 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v4p0_v3p0__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; 
GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_0_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_1_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_2_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p0_v3p0__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_3_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; 
GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_4_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_u_u() { +; 
GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_0_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_1_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void 
@s_shuffle_v4p0_v3p0__5_5_2_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_3_u() { +; GFX900-LABEL: 
s_shuffle_v4p0_v3p0__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_4_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: 
s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_5_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_5_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_5_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 
s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_5_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s12 +; 
GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_5_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; 
GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s12 +; GFX940-NEXT: s_mov_b32 s5, s13 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_5_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; 
GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_0_0_0: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_0_0_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; 
GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_u_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + 
ret void +} + +define void @s_shuffle_v4p0_v3p0__5_1_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_2_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_3_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; 
GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_4_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p0_v3p0__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_u_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 
+; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_1_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x 
i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_2_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s12 +; GFX940-NEXT: s_mov_b32 s5, s13 +; 
GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_3_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: s_mov_b32 s4, s12 +; GFX940-NEXT: s_mov_b32 s5, s13 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_4_0() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; 
GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 
s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p0_v3p0__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; 
GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + 
%shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, 
s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_u_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_0_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_2_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_3_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, 
s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_4_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_u_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_0_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_2_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_3_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p0_v3p0__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_4_1() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; 
GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: 
s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; 
GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__3_2_2_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__4_2_2_2: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_u_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_0_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: 
s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_1_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_3_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_4_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; 
GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_u_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_0_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_1_2() { +; 
GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_3_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: s_mov_b32 s8, s16 +; GFX900-NEXT: s_mov_b32 s9, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: s_mov_b32 s8, s16 +; GFX90A-NEXT: s_mov_b32 s9, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: s_mov_b32 s4, s12 +; GFX940-NEXT: s_mov_b32 s5, s13 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_4_2() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[16:21] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s20 +; GFX900-NEXT: s_mov_b32 s5, s21 +; GFX900-NEXT: s_mov_b32 s6, s20 +; GFX900-NEXT: s_mov_b32 s7, s21 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[16:21] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s20 +; GFX90A-NEXT: s_mov_b32 s5, s21 +; GFX90A-NEXT: s_mov_b32 s6, s20 +; GFX90A-NEXT: s_mov_b32 s7, s21 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[12:17] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s16 +; GFX940-NEXT: s_mov_b32 s1, s17 +; GFX940-NEXT: s_mov_b32 s2, s16 +; GFX940-NEXT: s_mov_b32 s3, s17 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x 
ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: 
s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__4_3_3_3() 
{ +; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect 
"; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", 
"=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_u_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_0_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: 
s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 0, i32 3, i32 3> ; result = {vec1[2], vec0[0], vec1[0], vec1[0]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_1_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 1, i32 3, i32 3> ; result = {vec1[2], vec0[1], vec1[0], vec1[0]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_2_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +;
GFX940-LABEL: s_shuffle_v4p0_v3p0__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 2, i32 3, i32 3> ; result = {vec1[2], vec0[2], vec1[0], vec1[0]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_4_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +;
GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 4, i32 3, i32 3> ; result = {vec1[2], vec1[1], vec1[0], vec1[0]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16
+; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 3, i32 3> ; result = {vec1[2], vec1[2], vec1[0], vec1[0]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_u_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +;
GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 poison, i32 3> ; result = {vec1[2], vec1[2], poison, vec1[0]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_0_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +;
GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 0, i32 3> ; result = {vec1[2], vec1[2], vec0[0], vec1[0]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_1_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_3: +;
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 1, i32 3> ; result = {vec1[2], vec1[2], vec0[1], vec1[0]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_2_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32
s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 2, i32 3> ; result = {vec1[2], vec1[2], vec0[2], vec1[0]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_4_3() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32
s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 5, i32 5, i32 4, i32 3> ; result = {vec1[2], vec1[2], vec1[1], vec1[0]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +;
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 poison, i32 4, i32 4, i32 4> ; result = {poison, vec1[1], vec1[1], vec1[1]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +;
GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 0, i32 4, i32 4, i32 4> ; result = {vec0[0], vec1[1], vec1[1], vec1[1]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +;
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 1, i32 4, i32 4, i32 4> ; result = {vec0[1], vec1[1], vec1[1], vec1[1]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0)
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> <i32 2, i32 4, i32 4, i32 4> ; result = {vec0[2], vec1[1], vec1[1], vec1[1]} + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret
void +} + +define void @s_shuffle_v4p0_v3p0__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: 
s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; 
GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_u_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_0_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_1_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_2_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret 
void +} + +define void @s_shuffle_v4p0_v3p0__5_3_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_4: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_u_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_0_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s14 +; 
GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_1_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x 
ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_2_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x 
ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_3_4() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__1_5_5_5() { +; GFX900-LABEL: 
s_shuffle_v4p0_v3p0__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, 
<4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__4_5_5_5() { +; 
GFX900-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_u_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_0_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_1_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x 
ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_2_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = 
call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_3_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_4_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void 
@s_shuffle_v4p0_v3p0__5_5_u_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_0_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use 
$0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_1_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, 
s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_2_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:17] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:17] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:13] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_3_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s2, s8 +; 
GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__5_5_4_5() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; 
GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %vec1 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll new file mode 100644 index 0000000000000..db929db38b5a9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll @@ -0,0 +1,27249 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4p0_v4p0__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v4p0__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> poison + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v4p0__4_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__6_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, 
v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, 
v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_4_u_u: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 
32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4p0_v4p0__7_7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: 
v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] 
offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 
v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 
sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_7_0: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; 
GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def 
$0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; 
GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_7_6: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_7_7: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] 
offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> zeroinitializer + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 
v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: 
v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4p0_v4p0__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__6_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: 
v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_u_0_0(ptr addrspace(1) inreg %ptr) 
{ +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, 
v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p0_v4p0__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; 
GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: 
v_mov_b32_e32 v1, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_5_0_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> 
%shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; 
GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 
v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; 
GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 
v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__6_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v2 +; GFX900-NEXT: v_mov_b32_e32 v11, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v11, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; 
GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v10, v2 +; GFX940-NEXT: v_mov_b32_e32 v11, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 1, i32 1> ; lanes: %vec1[3], %vec0[1], %vec0[1], %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 1, i32 1> ; lanes: %vec1[3], poison, %vec0[1], %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_0_1_1: +; GFX900: 
; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 
+; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 1, i32 1> ; lanes: %vec1[3], %vec0[0], %vec0[1], %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 1, i32 1> ; lanes: %vec1[3], %vec0[2], %vec0[1], %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 1, i32 1> ; lanes: %vec1[3], %vec0[3], %vec0[1], %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: 
v_mov_b32_e32 v0, v12 +; GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 1, i32 1> ; lanes: %vec1[3], %vec1[0], %vec0[1], %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, 
align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 
sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 1, i32 1> ; lanes: %vec1[3], %vec1[1], %vec0[1], %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 1, i32 1> ; lanes: %vec1[3], %vec1[2], %vec0[1], %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; 
GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 1> ; lanes: %vec1[3], %vec1[3], %vec0[1], %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_1: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 1> ; lanes: %vec1[3], %vec1[3], poison, %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, 
v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 1> ; lanes: %vec1[3], %vec1[3], %vec0[0], %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; 
GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 1> ; lanes: %vec1[3], %vec1[3], %vec0[2], %vec0[1] + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4p0_v4p0__7_7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT:
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 1> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: v_mov_b32_e32 v9, v3 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v9, v3 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17]
offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v2 +; GFX940-NEXT: v_mov_b32_e32 v9, v3 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 1> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v8 +; GFX900-NEXT: v_mov_b32_e32 v1, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL:
v_shuffle_v4p0_v4p0__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v9 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 1> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32
v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT:
global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8,
v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +;
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +;
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> <i32 4, i32 2, i32 2, i32 2> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +;
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 2, i32 2, i32 2> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__6_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT:
v_mov_b32_e32 v12, v4 +; GFX900-NEXT: v_mov_b32_e32 v13, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v12, v4 +; GFX90A-NEXT: v_mov_b32_e32 v13, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v12, v4 +; GFX940-NEXT: v_mov_b32_e32 v13, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 2, i32 2, i32 2> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL:
v_shuffle_v4p0_v4p0__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 2, i32 2> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v12 +; GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 2, i32 2> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT:
v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 2, i32 2> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v12 +; GFX900-NEXT: v_mov_b32_e32 v1, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v12 +; GFX90A-NEXT: v_mov_b32_e32 v1, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v12 +; GFX940-NEXT: v_mov_b32_e32 v1, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x 
ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 
v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v8, v12 +; GFX900-NEXT: v_mov_b32_e32 v9, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v12 +; GFX90A-NEXT: v_mov_b32_e32 v9, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v8, v12 +; GFX940-NEXT: v_mov_b32_e32 v9, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; 
GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4p0_v4p0__7_7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: 
v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: 
global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] 
offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_6_2(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v10 +; GFX900-NEXT: v_mov_b32_e32 v3, v11 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v10 +; GFX90A-NEXT: v_mov_b32_e32 v3, v11 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v10 +; GFX940-NEXT: v_mov_b32_e32 v3, v11 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: 
global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x 
ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> 
%vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__4_3_3_3(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__6_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v14, v6 +; GFX900-NEXT: v_mov_b32_e32 v15, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v14, v6 +; GFX90A-NEXT: v_mov_b32_e32 v15, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 
v14, v6 +; GFX940-NEXT: v_mov_b32_e32 v15, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p0_v4p0__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; 
GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p0_v4p0__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v14 +; GFX900-NEXT: v_mov_b32_e32 v3, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v14 +; GFX90A-NEXT: v_mov_b32_e32 v3, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v14 +; GFX940-NEXT: v_mov_b32_e32 v3, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v14 +; GFX900-NEXT: v_mov_b32_e32 v1, v15 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 
0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v14 +; GFX90A-NEXT: v_mov_b32_e32 v1, v15 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v14 +; GFX940-NEXT: v_mov_b32_e32 v1, v15 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; 
GFX900-NEXT: v_mov_b32_e32 v8, v14 +; GFX900-NEXT: v_mov_b32_e32 v9, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v14 +; GFX90A-NEXT: v_mov_b32_e32 v9, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v14 +; GFX940-NEXT: v_mov_b32_e32 v9, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p0_v4p0__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v10, v14 +; GFX900-NEXT: v_mov_b32_e32 v11, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v10, v14 +; GFX90A-NEXT: v_mov_b32_e32 v11, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v10, v14 +; GFX940-NEXT: v_mov_b32_e32 v11, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p0_v4p0__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 
+; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4p0_v4p0__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + 
%shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v12 +; GFX900-NEXT: v_mov_b32_e32 v5, v13 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v12 +; GFX90A-NEXT: v_mov_b32_e32 v5, v13 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v12 +; GFX940-NEXT: v_mov_b32_e32 v5, v13 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v4p0__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__0_4_4_4: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p0_v4p0__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__5_4_4_4: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__6_4_4_4(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, 
ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; 
GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; 
GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4p0_v4p0__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void 
+} + +define void @v_shuffle_v4p0_v4p0__7_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 sc0 sc1 
+; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v0 +; GFX900-NEXT: v_mov_b32_e32 v9, v1 +; GFX900-NEXT: v_mov_b32_e32 v10, v0 +; GFX900-NEXT: v_mov_b32_e32 v11, v1 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v1 +; GFX90A-NEXT: v_mov_b32_e32 v10, v0 +; GFX90A-NEXT: v_mov_b32_e32 v11, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v9, v1 +; GFX940-NEXT: v_mov_b32_e32 v10, v0 +; GFX940-NEXT: v_mov_b32_e32 v11, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; 
GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; 
GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: 
v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> 
%vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) 
+; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 
v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p0_v4p0__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, v10 +; GFX940-NEXT: v_mov_b32_e32 v13, v11 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x 
ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4p0_v4p0__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__6_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__6_5_5_5: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_5_5_5(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, 
ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() 
+ %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: 
global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: 
v_mov_b32_e32 v13, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v10 +; GFX940-NEXT: v_mov_b32_e32 v13, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_4_5_5(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 
sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 
sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 
0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_0_5: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store 
<4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_6_5(ptr addrspace(1) 
inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr 
addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__u_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__0_6_6_6(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4p0_v4p0__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v6 +; GFX900-NEXT: v_mov_b32_e32 v9, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v6 +; GFX90A-NEXT: v_mov_b32_e32 v9, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v6 +; GFX940-NEXT: v_mov_b32_e32 v9, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__1_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v8 +; GFX900-NEXT: v_mov_b32_e32 v11, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v8 +; GFX90A-NEXT: v_mov_b32_e32 v11, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v8 +; GFX940-NEXT: v_mov_b32_e32 v11, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__2_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v10 +; GFX900-NEXT: v_mov_b32_e32 v13, v11 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v10 +; GFX90A-NEXT: v_mov_b32_e32 v13, v11 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], 
s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v10 +; GFX940-NEXT: v_mov_b32_e32 v13, v11 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__3_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, v12 +; GFX900-NEXT: v_mov_b32_e32 v15, v13 +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, v12 +; GFX90A-NEXT: v_mov_b32_e32 v15, v13 +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, v12 +; GFX940-NEXT: v_mov_b32_e32 v15, v13 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__4_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 
v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__5_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__6_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v4 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v4 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v10, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 
x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: 
v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v4 +; GFX900-NEXT: v_mov_b32_e32 v9, v5 +; GFX900-NEXT: v_mov_b32_e32 v10, v4 +; GFX900-NEXT: v_mov_b32_e32 v11, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v5 +; GFX90A-NEXT: v_mov_b32_e32 v10, v4 +; GFX90A-NEXT: v_mov_b32_e32 v11, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v5 +; GFX940-NEXT: v_mov_b32_e32 v10, v4 +; GFX940-NEXT: v_mov_b32_e32 v11, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v8 +; GFX900-NEXT: v_mov_b32_e32 v5, v9 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v8 +; GFX90A-NEXT: v_mov_b32_e32 v5, v9 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v8 +; GFX940-NEXT: v_mov_b32_e32 v5, v9 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v10 +; GFX900-NEXT: v_mov_b32_e32 v7, v11 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v10 +; GFX90A-NEXT: v_mov_b32_e32 v7, v11 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v10 +; GFX940-NEXT: v_mov_b32_e32 v7, v11 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, v6 +; GFX900-NEXT: v_mov_b32_e32 v11, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; 
GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, v6 +; GFX90A-NEXT: v_mov_b32_e32 v11, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v6 +; GFX940-NEXT: v_mov_b32_e32 v11, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_4_6(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) 
%ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void 
@v_shuffle_v4p0_v4p0__u_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 poison, i32 7, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__0_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__0_7_7_7: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 0, i32 7, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__1_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 1, i32 7, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__2_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[4:7], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 2, i32 7, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__3_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 3, i32 7, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__4_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 
v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 4, i32 7, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__5_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: 
global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 5, i32 7, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__6_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; 
GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 6, i32 7, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; 
GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 poison, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 0, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: v_mov_b32_e32 v0, v10 +; GFX900-NEXT: v_mov_b32_e32 v1, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: v_mov_b32_e32 v0, v10 +; GFX90A-NEXT: v_mov_b32_e32 v1, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: v_mov_b32_e32 v0, v10 +; GFX940-NEXT: v_mov_b32_e32 v1, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: v_mov_b32_e32 v2, v12 +; GFX900-NEXT: v_mov_b32_e32 v3, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: v_mov_b32_e32 v2, v12 +; GFX90A-NEXT: v_mov_b32_e32 v3, v13 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] offset:16 +; GFX90A-NEXT: 
global_store_dwordx4 v14, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: v_mov_b32_e32 v2, v12 +; GFX940-NEXT: v_mov_b32_e32 v3, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: v_mov_b32_e32 v4, v14 +; GFX900-NEXT: v_mov_b32_e32 v5, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: v_mov_b32_e32 v4, v14 +; GFX90A-NEXT: v_mov_b32_e32 v5, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: v_mov_b32_e32 v4, v14 +; GFX940-NEXT: v_mov_b32_e32 v5, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; 
GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 7, i32 7> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p0_v4p0__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} 
+ +define void @v_shuffle_v4p0_v4p0__7_7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_0_7(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v8 +; GFX900-NEXT: v_mov_b32_e32 v3, v9 +; GFX900-NEXT: v_mov_b32_e32 v6, v8 +; GFX900-NEXT: v_mov_b32_e32 v7, v9 +; GFX900-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v9 +; GFX90A-NEXT: v_mov_b32_e32 v6, v8 +; GFX90A-NEXT: v_mov_b32_e32 v7, v9 +; GFX90A-NEXT: global_store_dwordx4 v10, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:9] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v8 +; GFX940-NEXT: v_mov_b32_e32 v3, v9 +; GFX940-NEXT: v_mov_b32_e32 v6, v8 +; GFX940-NEXT: v_mov_b32_e32 v7, v9 +; GFX940-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v10, 
v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v10 +; GFX900-NEXT: v_mov_b32_e32 v5, v11 +; GFX900-NEXT: v_mov_b32_e32 v8, v10 +; GFX900-NEXT: v_mov_b32_e32 v9, v11 +; GFX900-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v10 +; GFX90A-NEXT: v_mov_b32_e32 v5, v11 +; GFX90A-NEXT: v_mov_b32_e32 v8, v10 +; GFX90A-NEXT: v_mov_b32_e32 v9, v11 +; GFX90A-NEXT: global_store_dwordx4 v12, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v12, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v10 +; GFX940-NEXT: v_mov_b32_e32 v5, v11 +; GFX940-NEXT: v_mov_b32_e32 v8, v10 +; GFX940-NEXT: v_mov_b32_e32 v9, v11 +; GFX940-NEXT: global_store_dwordx4 v12, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[6:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v14, 0 +; GFX900-NEXT: v_mov_b32_e32 v6, v12 +; GFX900-NEXT: v_mov_b32_e32 v7, v13 +; GFX900-NEXT: v_mov_b32_e32 v10, v12 +; GFX900-NEXT: v_mov_b32_e32 v11, v13 +; GFX900-NEXT: global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v12 +; GFX90A-NEXT: v_mov_b32_e32 v7, v13 +; GFX90A-NEXT: v_mov_b32_e32 v10, v12 +; GFX90A-NEXT: v_mov_b32_e32 v11, v13 +; GFX90A-NEXT: 
global_store_dwordx4 v14, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v14, v[10:13], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v14, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:13] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v6, v12 +; GFX940-NEXT: v_mov_b32_e32 v7, v13 +; GFX940-NEXT: v_mov_b32_e32 v10, v12 +; GFX940-NEXT: v_mov_b32_e32 v11, v13 +; GFX940-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v16, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v12, v6 +; GFX900-NEXT: v_mov_b32_e32 v13, v7 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v12, v14 +; GFX900-NEXT: v_mov_b32_e32 v13, v15 +; GFX900-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_3_7: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v12, v6 +; GFX90A-NEXT: v_mov_b32_e32 v13, v7 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, v14 +; GFX90A-NEXT: v_mov_b32_e32 v13, v15 +; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v16, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v12, v6 +; GFX940-NEXT: v_mov_b32_e32 v13, v7 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v12, v14 +; GFX940-NEXT: v_mov_b32_e32 v13, v15 +; GFX940-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v6 +; GFX900-NEXT: 
v_mov_b32_e32 v3, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @v_shuffle_v4p0_v4p0__7_7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p0_v4p0__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; 
GFX900-NEXT: s_nop 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p0_v4p0__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p0_v4p0__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v4, v6 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=v"() + %vec1 = call <4 x ptr> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + store <4 x ptr> %shuf, ptr addrspace(1) %ptr, align 32 + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p0_v4p0__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__6_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x 
ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_0_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 
+; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_1_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 1, i32 poison, i32 poison> ; 7_1_u_u: vec1[3], vec0[1], poison, poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_2_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_u_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 2, i32 poison, i32 poison> ; 7_2_u_u: vec1[3], vec0[2], poison, poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_3_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 3, i32 poison, i32 poison> ; 7_3_u_u: vec1[3], vec0[3], poison, poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_4_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 
s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 4, i32 poison, i32 poison> ; 7_4_u_u: vec1[3], vec1[0], poison, poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_5_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; 
GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 5, i32 poison, i32 poison> ; 7_5_u_u: vec1[3], vec1[1], poison, poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_6_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 6, i32 poison, i32 poison> ; 7_6_u_u: vec1[3], vec1[2], poison, poison + 
call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 poison> ; 7_7_u_u: vec1[3], vec1[3], poison, poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_0_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 poison> ; 7_7_0_u: vec1[3], vec1[3], vec0[0], poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_u() { +; 
GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 
x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 poison> ; 7_7_1_u: vec1[3], vec1[3], vec0[1], poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 poison> ; 7_7_2_u: vec1[3], vec1[3], vec0[2], poison + call void 
asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> 
asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 poison> ; 7_7_3_u: vec1[3], vec1[3], vec0[3], poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 poison> ; 7_7_4_u: vec1[3], vec1[3], vec1[0], poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_u() { +; GFX900-LABEL: 
s_shuffle_v4p0_v4p0__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 poison> ; 7_7_5_u: vec1[3], vec1[3], vec1[1], poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: 
s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 poison> ; 7_7_6_u: vec1[3], vec1[3], vec1[2], poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_u() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 7, i32 poison> ; 7_7_7_u: vec1[3], vec1[3], vec1[3], poison + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_0: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 7, i32 0> ; 7_7_7_0: vec1[3], vec1[3], vec1[3], vec0[0] + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, 
s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> <i32 7, i32 7, i32 7, i32 1> ; 7_7_7_1: vec1[3], vec1[3], vec1[3], vec0[1] + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void 
@s_shuffle_v4p0_v4p0__7_7_7_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: 
s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; 
GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; 
GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: 
s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 
+; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: s_mov_b32 s8, s4 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: s_mov_b32 s8, s4 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: s_mov_b32 s4, s0 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__6_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_u_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_1_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 
s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_2_0_0() { +; 
GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_3_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s8, s0 +; 
GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_4_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_5_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_6_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_0_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_0() { +; 
GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; 
GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_0() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 
+; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4p0_v4p0__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: 
s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", 
"=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__6_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s12 +; GFX900-NEXT: s_mov_b32 s5, s13 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s12 +; GFX90A-NEXT: s_mov_b32 s5, s13 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s8 +; GFX940-NEXT: s_mov_b32 s1, s9 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, 
s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, 
s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_u_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] 
+; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_0_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_2_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_3_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s18 +; GFX900-NEXT: s_mov_b32 s9, s19 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s18 +; GFX90A-NEXT: s_mov_b32 s9, s19 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s14 +; GFX940-NEXT: s_mov_b32 s5, s15 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_4_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 
+; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_5_1_1() { 
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x 
ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_6_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; 
GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_1_1: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: 
s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_0_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; 
GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + 
%vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 
s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_1() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 
s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void 
@s_shuffle_v4p0_v4p0__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: 
s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__6_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p0_v4p0__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, 
s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_u_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: 
s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_0_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s4 
+; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_1_2_2() { +; GFX900-LABEL: 
s_shuffle_v4p0_v4p0__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_3_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 
x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_4_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> 
asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_5_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_6_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: 
s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_u_2: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_0_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; 
GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, 
s19 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: s_mov_b32 s20, s12 +; GFX900-NEXT: s_mov_b32 s21, s13 +; GFX900-NEXT: s_mov_b32 s22, s8 +; GFX900-NEXT: s_mov_b32 s23, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: s_mov_b32 s20, s12 +; GFX90A-NEXT: s_mov_b32 s21, s13 +; GFX90A-NEXT: s_mov_b32 s22, s8 +; GFX90A-NEXT: s_mov_b32 s23, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: s_mov_b32 s16, s8 +; GFX940-NEXT: s_mov_b32 s17, s9 +; GFX940-NEXT: s_mov_b32 s18, s4 +; GFX940-NEXT: s_mov_b32 s19, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: s_mov_b32 s20, s14 +; GFX900-NEXT: s_mov_b32 s21, s15 +; GFX900-NEXT: s_mov_b32 s22, s8 +; GFX900-NEXT: s_mov_b32 s23, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: s_mov_b32 s20, s14 +; GFX90A-NEXT: s_mov_b32 s21, s15 +; GFX90A-NEXT: s_mov_b32 s22, s8 +; GFX90A-NEXT: s_mov_b32 s23, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: s_mov_b32 s16, s10 +; GFX940-NEXT: s_mov_b32 s17, s11 +; GFX940-NEXT: s_mov_b32 s18, s4 +; GFX940-NEXT: s_mov_b32 s19, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_2() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s14, s4 
+; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 
s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 
+; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: 
s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__6_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s16 +; GFX900-NEXT: s_mov_b32 s5, s17 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s16 +; 
GFX90A-NEXT: s_mov_b32 s5, s17 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s12 +; GFX940-NEXT: s_mov_b32 s1, s13 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_u_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_0_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_1_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; 
GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_2_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: 
s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_4_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_5_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use 
$0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_6_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s16 +; GFX900-NEXT: s_mov_b32 s7, s17 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s16 +; GFX90A-NEXT: s_mov_b32 s7, s17 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s12 +; GFX940-NEXT: s_mov_b32 s3, s13 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + 
%vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_0_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: 
s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 
+; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_3() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect 
"; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 
+; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__6_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; 
GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; 
GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_u_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; 
GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_0_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_1_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_2_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: 
s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_3_4_4() { +; GFX900-LABEL: 
s_shuffle_v4p0_v4p0__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_5_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; 
GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_6_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 
+; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s0 +; 
GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 
x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_0_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + 
%vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s4 
+; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_4() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s4 +; GFX900-NEXT: s_mov_b32 s19, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s4 +; GFX90A-NEXT: s_mov_b32 s19, s5 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: s_mov_b32 s18, s14 +; GFX900-NEXT: s_mov_b32 s19, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: s_mov_b32 s18, s14 +; GFX90A-NEXT: s_mov_b32 s19, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: s_mov_b32 s14, s10 +; GFX940-NEXT: s_mov_b32 s15, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: s_mov_b32 s18, s14 +; GFX900-NEXT: s_mov_b32 s19, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: 
s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: s_mov_b32 s18, s14 +; GFX90A-NEXT: s_mov_b32 s19, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: s_mov_b32 s14, s10 +; GFX940-NEXT: s_mov_b32 s15, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__6_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; 
GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_u_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_0_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_1_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_2_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: 
s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_3_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_4_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_6_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 
+; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, 
s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_0_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 
s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x 
i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s6 +; GFX900-NEXT: s_mov_b32 s15, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s6 +; GFX90A-NEXT: s_mov_b32 s15, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: s_mov_b32 s10, s2 +; GFX940-NEXT: s_mov_b32 s11, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void 
@s_shuffle_v4p0_v4p0__7_7_6_5() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s6 +; GFX900-NEXT: s_mov_b32 s19, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s6 +; GFX90A-NEXT: s_mov_b32 s19, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s2 +; GFX940-NEXT: s_mov_b32 s15, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> 
%vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__0_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void 
@s_shuffle_v4p0_v4p0__1_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: s_mov_b32 s10, s8 +; GFX940-NEXT: s_mov_b32 s11, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__2_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s18, s16 +; GFX900-NEXT: s_mov_b32 s19, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s18, s16 +; GFX90A-NEXT: s_mov_b32 s19, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: s_mov_b32 s14, s12 +; GFX940-NEXT: s_mov_b32 s15, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s15, s17 +; GFX900-NEXT: s_mov_b32 s18, s16 +; GFX900-NEXT: s_mov_b32 s19, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s16 +; GFX90A-NEXT: s_mov_b32 s15, s17 +; GFX90A-NEXT: s_mov_b32 s18, s16 +; GFX90A-NEXT: s_mov_b32 s19, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: 
s_mov_b32 s10, s12 +; GFX940-NEXT: s_mov_b32 s11, s13 +; GFX940-NEXT: s_mov_b32 s14, s12 +; GFX940-NEXT: s_mov_b32 s15, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x 
ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__6_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_6_6_6() { 
+; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_u_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_0_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, 
s5 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s10, s8 +; GFX940-NEXT: s_mov_b32 s11, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_1_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s8 +; GFX940-NEXT: s_mov_b32 s5, s9 +; GFX940-NEXT: s_mov_b32 s6, s8 +; GFX940-NEXT: s_mov_b32 s7, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_2_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_6_6: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s18, s16 +; GFX900-NEXT: s_mov_b32 s19, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s18, s16 +; GFX90A-NEXT: s_mov_b32 s19, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s14, s12 +; GFX940-NEXT: s_mov_b32 s15, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect 
"; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_3_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s18, s16 +; GFX900-NEXT: s_mov_b32 s19, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s18, s16 +; GFX90A-NEXT: s_mov_b32 s19, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s14, s12 +; GFX940-NEXT: s_mov_b32 s15, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_4_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: s_mov_b32 s16, s8 +; GFX900-NEXT: s_mov_b32 s17, s9 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s8 +; GFX90A-NEXT: s_mov_b32 s17, s9 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s4 +; GFX940-NEXT: s_mov_b32 s13, s5 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_5_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x 
ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_6() { +; GFX900-LABEL: 
s_shuffle_v4p0_v4p0__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s10, s8 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s10, s8 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s6, s4 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_0_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: s_mov_b32 s18, s14 +; GFX900-NEXT: s_mov_b32 s19, s15 +; GFX900-NEXT: s_mov_b32 s20, s4 +; GFX900-NEXT: s_mov_b32 s21, s5 +; GFX900-NEXT: s_mov_b32 s22, s12 +; GFX900-NEXT: s_mov_b32 s23, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: s_mov_b32 s18, s14 +; GFX90A-NEXT: s_mov_b32 s19, s15 +; GFX90A-NEXT: s_mov_b32 s20, s4 +; GFX90A-NEXT: s_mov_b32 s21, s5 +; GFX90A-NEXT: s_mov_b32 s22, s12 +; GFX90A-NEXT: s_mov_b32 s23, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s16, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s10 +; GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: s_mov_b32 s14, s10 +; GFX940-NEXT: s_mov_b32 s15, s11 +; GFX940-NEXT: s_mov_b32 s17, s1 +; GFX940-NEXT: s_mov_b32 s18, s8 +; GFX940-NEXT: s_mov_b32 s19, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + 
%vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s16, s14 +; GFX900-NEXT: s_mov_b32 s17, s15 +; GFX900-NEXT: s_mov_b32 s18, s14 +; GFX900-NEXT: s_mov_b32 s19, s15 +; GFX900-NEXT: s_mov_b32 s20, s6 +; GFX900-NEXT: s_mov_b32 s21, s7 +; GFX900-NEXT: s_mov_b32 s22, s12 +; GFX900-NEXT: s_mov_b32 s23, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[16:23] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s16, s14 +; GFX90A-NEXT: s_mov_b32 s17, s15 +; GFX90A-NEXT: s_mov_b32 s18, s14 +; GFX90A-NEXT: s_mov_b32 s19, s15 +; GFX90A-NEXT: s_mov_b32 s20, s6 +; GFX90A-NEXT: s_mov_b32 s21, s7 +; GFX90A-NEXT: s_mov_b32 s22, s12 +; GFX90A-NEXT: s_mov_b32 s23, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[16:23] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s16, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s12, s10 +; 
GFX940-NEXT: s_mov_b32 s13, s11 +; GFX940-NEXT: s_mov_b32 s14, s10 +; GFX940-NEXT: s_mov_b32 s15, s11 +; GFX940-NEXT: s_mov_b32 s17, s3 +; GFX940-NEXT: s_mov_b32 s18, s8 +; GFX940-NEXT: s_mov_b32 s19, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[12:19] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s16 +; GFX900-NEXT: s_mov_b32 s11, s17 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 
s10, s16 +; GFX90A-NEXT: s_mov_b32 s11, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s12 +; GFX940-NEXT: s_mov_b32 s7, s13 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; 
GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_6() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s8 +; GFX900-NEXT: s_mov_b32 s19, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s8 +; GFX90A-NEXT: s_mov_b32 s19, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__u_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__0_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s14 +; GFX900-NEXT: s_mov_b32 s7, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_mov_b32 s6, s14 +; GFX90A-NEXT: s_mov_b32 s7, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s10 +; GFX940-NEXT: s_mov_b32 s3, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__1_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__2_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s8 +; GFX900-NEXT: s_mov_b32 s13, s9 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s8 +; GFX90A-NEXT: s_mov_b32 s13, s9 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__3_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s18 
+; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__4_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__5_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: s_shuffle_v4p0_v4p0__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__6_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s8 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s8 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s4 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_u_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: 
s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_0_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_mov_b32 s13, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: 
s_mov_b32 s12, s14 +; GFX90A-NEXT: s_mov_b32 s13, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: s_mov_b32 s8, s10 +; GFX940-NEXT: s_mov_b32 s9, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_1_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s15 +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s15 +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s11 +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_2_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_3_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s18 +; GFX900-NEXT: s_mov_b32 s17, s19 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s18 +; GFX90A-NEXT: s_mov_b32 s17, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s14 +; GFX940-NEXT: s_mov_b32 s13, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_4_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: 
s_mov_b32 s16, s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_5_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; 
GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_6_7_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p0_v4p0__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_u_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_0_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, 
s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s0 +; GFX940-NEXT: s_mov_b32 s9, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_1_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s14 +; GFX900-NEXT: s_mov_b32 s9, s15 +; GFX900-NEXT: s_mov_b32 s10, s14 +; GFX900-NEXT: s_mov_b32 s11, s15 +; GFX900-NEXT: s_mov_b32 s12, s6 +; GFX900-NEXT: s_mov_b32 s13, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s14 +; GFX90A-NEXT: s_mov_b32 s9, s15 +; GFX90A-NEXT: s_mov_b32 s10, s14 +; GFX90A-NEXT: s_mov_b32 s11, s15 +; GFX90A-NEXT: s_mov_b32 s12, s6 +; GFX90A-NEXT: s_mov_b32 s13, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s10 +; GFX940-NEXT: s_mov_b32 s5, s11 +; GFX940-NEXT: s_mov_b32 s6, s10 +; GFX940-NEXT: s_mov_b32 s7, s11 +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_2_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s18 +; GFX900-NEXT: s_mov_b32 s5, s19 +; GFX900-NEXT: s_mov_b32 s6, s18 +; GFX900-NEXT: s_mov_b32 s7, s19 +; GFX900-NEXT: s_mov_b32 s10, s18 +; GFX900-NEXT: s_mov_b32 s11, s19 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s18 +; GFX90A-NEXT: s_mov_b32 s5, s19 +; GFX90A-NEXT: s_mov_b32 s6, s18 +; GFX90A-NEXT: s_mov_b32 s7, s19 +; GFX90A-NEXT: s_mov_b32 s10, s18 +; GFX90A-NEXT: s_mov_b32 s11, s19 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s14 +; GFX940-NEXT: s_mov_b32 s1, s15 +; GFX940-NEXT: s_mov_b32 s2, s14 +; GFX940-NEXT: s_mov_b32 s3, s15 +; GFX940-NEXT: s_mov_b32 s6, s14 +; GFX940-NEXT: s_mov_b32 s7, s15 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_3_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s18 +; GFX900-NEXT: s_mov_b32 s13, s19 +; GFX900-NEXT: s_mov_b32 s14, s18 +; GFX900-NEXT: s_mov_b32 s15, s19 +; GFX900-NEXT: s_mov_b32 s16, 
s10 +; GFX900-NEXT: s_mov_b32 s17, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s18 +; GFX90A-NEXT: s_mov_b32 s13, s19 +; GFX90A-NEXT: s_mov_b32 s14, s18 +; GFX90A-NEXT: s_mov_b32 s15, s19 +; GFX90A-NEXT: s_mov_b32 s16, s10 +; GFX90A-NEXT: s_mov_b32 s17, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s14 +; GFX940-NEXT: s_mov_b32 s9, s15 +; GFX940-NEXT: s_mov_b32 s10, s14 +; GFX940-NEXT: s_mov_b32 s11, s15 +; GFX940-NEXT: s_mov_b32 s12, s6 +; GFX940-NEXT: s_mov_b32 s13, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_4_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: 
s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s4 +; GFX900-NEXT: s_mov_b32 s17, s5 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s4 +; GFX90A-NEXT: s_mov_b32 s17, s5 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_5_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s10 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s10 +; GFX900-NEXT: s_mov_b32 s15, s11 +; GFX900-NEXT: s_mov_b32 s16, s6 +; GFX900-NEXT: s_mov_b32 s17, s7 +; GFX900-NEXT: s_mov_b32 s18, s10 +; GFX900-NEXT: s_mov_b32 s19, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:19] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s10 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s10 +; GFX90A-NEXT: s_mov_b32 s15, s11 +; GFX90A-NEXT: s_mov_b32 s16, s6 +; GFX90A-NEXT: s_mov_b32 s17, s7 +; GFX90A-NEXT: s_mov_b32 s18, s10 +; GFX90A-NEXT: s_mov_b32 s19, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:19] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s6 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s6 +; GFX940-NEXT: s_mov_b32 s11, s7 +; GFX940-NEXT: s_mov_b32 s12, s2 +; GFX940-NEXT: s_mov_b32 s13, s3 +; GFX940-NEXT: s_mov_b32 s14, s6 +; GFX940-NEXT: s_mov_b32 s15, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v4p0__7_7_6_7() { +; GFX900-LABEL: s_shuffle_v4p0_v4p0__7_7_6_7: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v4p0__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v4p0__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr> asm "; def $0", "=s"() + %vec1 = call <4 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr> %vec0, <4 x ptr> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:17]}"(<4 x ptr> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll new file mode 100644 index 0000000000000..64acacc1f9148 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll @@ -0,0 +1,6434 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4p3_v2p3__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v2p3__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> poison + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 
x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v2p3__2_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def 
$0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr 
addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_u_u: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_1_u: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_3_0: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p3_v2p3__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 
v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = 
shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> zeroinitializer + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v2p3__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_u_0_0: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_2_0_0(ptr addrspace(1) inreg 
%ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v2p3__3_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v2p3__3_3_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v2p3__3_3_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> 
%vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; 
GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], 
s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 
v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v2p3__u_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v2p3__2_2_2_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p3_v2p3__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 
sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x 
ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 
= call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: 
global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf 
= shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr 
addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 
= call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v2p3__3_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] 
+; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v2, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v2, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:1] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v2p3__3_3_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v2p3__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v2p3__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v2p3__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v4p3_v2p3__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 
x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_0_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_1_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_2_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_0_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_1_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_2_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_3_u() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_3_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() 
+ %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_3_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_3_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__u_0_0_0: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__0_0_0_0: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_u_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_1_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_2_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v2p3__3_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_u_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_1_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p3_v2p3__3_3_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_2_0() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; 
GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_u_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 
s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_0_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_2_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> 
%shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_u_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_0_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_2_1() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v2p3__3_3_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> poison, <4 x i32> + call 
void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_u_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_0_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_1_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_u_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; 
def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_0_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x 
ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_1_2() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr 
addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: 
; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_u_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_0_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_1_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = 
shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_2_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_u_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_0_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_1_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:5] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:1] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v2p3__3_3_2_3() { +; GFX900-LABEL: s_shuffle_v4p3_v2p3__3_3_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[6:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v2p3__3_3_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v2p3__3_3_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[2:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <2 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <2 x ptr addrspace(3)> %vec0, <2 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll new file mode 100644 index 0000000000000..586c0edc8d9be --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll @@ -0,0 +1,14014 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4p3_v3p3__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v3p3__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> poison + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p3_v3p3__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v3p3__3_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + 
%shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr 
addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void 
+} + +define void @v_shuffle_v4p3_v3p3__5_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v3p3__5_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: 
global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: 
v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 
v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = 
shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4p3_v3p3__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> zeroinitializer + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v0, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v3p3__5_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr 
addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; 
def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 
v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: 
v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v9, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v9, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + 
+define void @v_shuffle_v4p3_v3p3__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p3_v3p3__5_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v3p3__5_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector 
<3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", 
"=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", 
"=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v5, 
v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, 
v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; 
GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store 
<4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__2_2_2_2: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__4_2_2_2: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, 
v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_0_2: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[5:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v3p3__5_5_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr 
addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: v_mov_b32_e32 v2, v6 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:8] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v8 +; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[6:8] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v0, v8 +; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; 
GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v3p3__u_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__2_3_3_3(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v3, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v3, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v3p3__3_3_3_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v3p3__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_2_3_3(ptr addrspace(1) inreg %ptr) { +; 
GFX900-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + 
store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 
16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v3p3__5_5_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p3_v3p3__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr 
addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> 
%shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p3_v3p3__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 
16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4p3_v3p3__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: 
global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p3_v3p3__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_2_5_5: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + 
%vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 
v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:2] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:2] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, 
v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v3p3__5_5_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:4] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v3p3__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:4] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v4p3_v3p3__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + 
call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v4p3_v3p3__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_0_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v3p3__5_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_1_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_2_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p3_v3p3__5_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_3_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_4_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_0_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_1_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_2_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_3_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_4_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> 
asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_5_u() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_5_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_5_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_5_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s10 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_5_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v3p3__5_5_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_5_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 
s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__2_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4p3_v3p3__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_u_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_1_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_2_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_3_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s12 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s12 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s8 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_4_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, 
s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_u_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s4 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_1_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_2_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_3_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, 
s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_4_0() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s13 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s9 +; GFX940-NEXT: 
s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__0_1_1_1() { +; GFX900-LABEL: 
s_shuffle_v4p3_v3p3__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm 
sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v4p3_v3p3__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_u_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_1_1: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_0_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def 
s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_2_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_3_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s12 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s12 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s8 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_4_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_u_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_0_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, 
s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_2_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s10 +; 
GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_3_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s12 +; 
GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_4_1() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, 
s13 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s9 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr 
addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 
+; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: 
s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_u_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_0_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_0_2_2: +; GFX940: ; %bb.0: 
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_1_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_3_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; 
GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_4_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = 
call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_u_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def 
$0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_0_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_1_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_3_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s12 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; 
def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_4_2() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s13 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s9 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> 
asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p3_v3p3__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v4p3_v3p3__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr 
addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_u_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_3_3: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_0_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s12 +; 
GFX900-NEXT: s_mov_b32 s7, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s12 +; GFX90A-NEXT: s_mov_b32 s7, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s8 +; GFX940-NEXT: s_mov_b32 s3, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_1_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_2_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_4_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v3p3__5_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, 
s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_u_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_0_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_1_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s12 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s12 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s8 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_2_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_4_3() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 
s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v3p3__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4p3_v3p3__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_u_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr 
addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_0_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s13 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s13 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s9 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x 
ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_1_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_2_4_4() { +; GFX900-LABEL: 
s_shuffle_v4p3_v3p3__5_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_3_4_4() { +; GFX900-LABEL: 
s_shuffle_v4p3_v3p3__5_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, 
s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_u_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_0_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 
+; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_1_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s9 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_2_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: 
s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_3_4() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p3_v3p3__5_5_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + 
call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define 
void @s_shuffle_v4p3_v3p3__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_u_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_0_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_1_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_2_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_3_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = 
shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_4_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_u_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_0_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v3p3__5_5_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_1_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[12:14] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s14 +; GFX900-NEXT: s_mov_b32 s5, s14 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s14 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[12:14] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s14 +; GFX90A-NEXT: s_mov_b32 s5, s14 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s14 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[8:10] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s10 +; GFX940-NEXT: s_mov_b32 s1, s10 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s10 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_2_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v3p3__5_5_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:6] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:2] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_3_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v3p3__5_5_4_5() { +; GFX900-LABEL: s_shuffle_v4p3_v3p3__5_5_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:10] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v3p3__5_5_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:10] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v3p3__5_5_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:6] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <3 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr addrspace(3)> %vec0, <3 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX90APLUS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll new file mode 100644 index 0000000000000..f5f0cef0b06aa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll @@ -0,0 +1,24149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s + + +define void @v_shuffle_v4p3_v4p3__u_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v4p3__u_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr 
addrspace(3)> poison, <4 x i32> poison + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__0_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__1_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__2_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__3_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__4_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v4p3__4_u_u_u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__5_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__6_u_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_u_u_u(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_0_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 
v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_1_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 
v0, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_2_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_3_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_4_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_u_u: +; GFX90A: 
; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_5_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_6_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_u_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 
x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_0_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_1_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: 
v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_2_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: 
v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_3_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_4_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_5_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v5, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx4 v5, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_6_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_7_u(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_7_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_7_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr 
addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_7_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_7_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_7_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_7_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_7_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 
x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x 
ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__u_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__0_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p3_v4p3__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> zeroinitializer + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__1_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__2_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: 
v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__3_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4p3_v4p3__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__4_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__5_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__6_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4p3_v4p3__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_0_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; 
GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_u_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_1_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_1_0_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store 
<4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_2_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + 
%vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_3_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_4_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_5_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_5_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_6_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p3_v4p3__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_0_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; 
GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_u_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_1_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_1_0: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_2_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; 
GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_3_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_4_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_0: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v4p3__7_7_5_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x 
ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_6_0(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", 
"=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__u_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__0_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__0_1_1_1: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__1_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], 
s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__2_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__3_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__4_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: 
; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__5_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__6_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_1_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p3_v4p3__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_u_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 
v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_0_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 
+; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_2_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_3_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v7 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX900-NEXT: 
s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v7 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_4_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: 
v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_5_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_6_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> 
%shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_1_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_u_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], 
s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_0_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_0_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_2_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_2_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_3_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_3_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 
v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_4_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 1> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_5_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 1> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_6_1(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 1> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__u_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__0_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4p3_v4p3__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__1_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__2_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__3_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__3_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) 
%ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__4_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> <i32 4, i32 2, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__5_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 5, i32 2, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__6_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 6, i32 2, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_2_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p3_v4p3__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 2, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + 
+define void @v_shuffle_v4p3_v4p3__7_u_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 poison, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v4p3__7_0_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", 
"=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 0, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_1_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 1, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_3_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; 
def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 3, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_4_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 4, i32 2, i32 2> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_5_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; 
GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_6_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_2_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_u_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_0_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_1_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, 
v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_3_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_4_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v4 +; GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_5_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v7, v4 +; GFX900-NEXT: v_mov_b32_e32 v8, v2 +; GFX900-NEXT: global_store_dwordx4 
v9, v[5:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v2 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: v_mov_b32_e32 v8, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v2 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_6_2(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: 
v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__u_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; 
GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__0_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__1_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__2_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__3_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 
+; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__4_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v4p3__5_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x 
ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__6_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v6 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_3_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; 
def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_u_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 
= call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_0_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_0_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; 
GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_1_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_1_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_2_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_4_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_5_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_6_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_3_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_u_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_0_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; 
GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_1_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_2_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_4_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_3: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_4_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_5_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_6_3(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: 
v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__u_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v4p3__u_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr 
addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__0_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__1_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX90A-LABEL: v_shuffle_v4p3_v4p3__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__2_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p3_v4p3__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__3_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; 
def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__4_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX9-LABEL: v_shuffle_v4p3_v4p3__4_4_4_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__5_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__6_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: 
v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_4_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: 
global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_u_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm 
"; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_0_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 
v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_1_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_2_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_3_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_5_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_6_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
v_shuffle_v4p3_v4p3__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_4_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_4_4: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_u_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_0_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_1_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_2_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_2_4: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_3_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; 
GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_5_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, 
v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_6_4(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__u_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__0_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__1_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__2_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__3_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__4_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__5_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, 
v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__6_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_5_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr 
addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_u_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v4p3__7_0_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_1_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_2_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: 
global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_3_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_4_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_6_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; 
GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_5_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_u_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v4, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: global_store_dwordx4 v4, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 
x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_0_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = 
shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_1_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; 
def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_2_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_3_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_3_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_4_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: 
global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_6_5(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: v_mov_b32_e32 v7, v1 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v1 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v1 +; GFX940-NEXT: global_store_dwordx4 v8, 
v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__u_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 
x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__0_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; 
def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__1_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 
s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__2_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, 
v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__3_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v6 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v6 +; GFX940-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__4_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 
x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__5_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr 
addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__6_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_6_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p3_v4p3__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_u_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_0_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: 
global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_1_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v4 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_2_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_3_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_4_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: 
v_shuffle_v4p3_v4p3__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v0 +; GFX900-NEXT: v_mov_b32_e32 v5, v2 +; GFX900-NEXT: v_mov_b32_e32 v6, v2 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_5_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_6_6: +; GFX900: ; %bb.0: +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_6_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_u_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_0_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v9, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, v4 +; GFX900-NEXT: v_mov_b32_e32 v6, v4 +; GFX900-NEXT: v_mov_b32_e32 v7, v0 +; GFX900-NEXT: v_mov_b32_e32 v8, v3 +; GFX900-NEXT: global_store_dwordx4 v9, v[5:8], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4p3_v4p3__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v0 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_1_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v10, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, v5 +; GFX900-NEXT: v_mov_b32_e32 v7, v5 +; GFX900-NEXT: 
v_mov_b32_e32 v8, v1 +; GFX900-NEXT: v_mov_b32_e32 v9, v4 +; GFX900-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, v5 +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_mov_b32_e32 v8, v1 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v10, v[6:9], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v10, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, v1 +; GFX940-NEXT: v_mov_b32_e32 v6, v5 +; GFX940-NEXT: v_mov_b32_e32 v7, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx4 v10, v[6:9], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_2_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_3_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v7 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: v_mov_b32_e32 v3, v6 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v6 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) 
%ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_4_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void 
+} + +define void @v_shuffle_v4p3_v4p3__7_7_5_6(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v2 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v2 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v2 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void 
@v_shuffle_v4p3_v4p3__u_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__0_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__1_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v1 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__2_7_7_7(ptr 
addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: v_mov_b32_e32 v4, v6 +; GFX900-NEXT: v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr 
addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__3_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> 
%vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__4_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__5_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__5_7_7_7: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__6_7_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_u_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; 
GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_0_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_0_7_7: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_1_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_2_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: v_mov_b32_e32 v4, v2 +; GFX900-NEXT: 
v_mov_b32_e32 v5, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[3:6], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_3_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v7 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_4_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v0 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v0 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v0 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_5_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_5_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: 
v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_5_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_5_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_6_7_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_6_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: v_mov_b32_e32 v2, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4p3_v4p3__7_6_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_6_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_u_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_u_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_u_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_u_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_0_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_0_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[1:4] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v5, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, v4 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: global_store_dwordx4 v5, v[1:4], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_0_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 
+; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_0_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_1_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_1_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[2:5] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v5 +; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v4, v1 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_1_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v[2:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_1_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[2:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_2_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_2_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[3:6] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v7, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: v_mov_b32_e32 v3, v6 +; GFX900-NEXT: global_store_dwordx4 v7, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_2_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v0, v7 +; GFX90A-NEXT: v_mov_b32_e32 v1, v7 +; GFX90A-NEXT: v_mov_b32_e32 v3, v7 +; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_2_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: v_mov_b32_e32 v1, v7 +; GFX940-NEXT: v_mov_b32_e32 v3, v7 +; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_3_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_3_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v7 +; GFX900-NEXT: v_mov_b32_e32 v5, v7 +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
v_shuffle_v4p3_v4p3__7_7_3_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v7 +; GFX90A-NEXT: v_mov_b32_e32 v5, v7 +; GFX90A-NEXT: v_mov_b32_e32 v6, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_3_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, v7 +; GFX940-NEXT: v_mov_b32_e32 v5, v7 +; GFX940-NEXT: v_mov_b32_e32 v6, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_4_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v0 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v0 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_4_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v0 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_5_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v8, 0 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_mov_b32_e32 v5, v3 +; GFX900-NEXT: v_mov_b32_e32 v6, v1 +; GFX900-NEXT: v_mov_b32_e32 v7, v3 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_mov_b32_e32 v6, v1 +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_5_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_mov_b32_e32 v6, v1 +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @v_shuffle_v4p3_v4p3__7_7_6_7(ptr addrspace(1) inreg %ptr) { +; GFX900-LABEL: v_shuffle_v4p3_v4p3__7_7_6_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def v[0:3] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_mov_b32_e32 v4, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: v_shuffle_v4p3_v4p3__7_7_6_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: v_shuffle_v4p3_v4p3__7_7_6_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def v[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v4, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v3 +; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + store <4 x ptr addrspace(3)> %shuf, ptr addrspace(1) %ptr, align 16 + ret void +} + +define void @s_shuffle_v4p3_v4p3__u_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__u_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__u_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__u_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, 
<4 x ptr addrspace(3)> poison, <4 x i32> poison + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__0_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__0_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__0_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__0_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__1_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__2_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x 
ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__3_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__3_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__3_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__3_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__5_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 
x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__6_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__6_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__6_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__6_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v4p3__7_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_0_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_0_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_0_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_0_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_1_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_1_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_1_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_1_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_2_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_2_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_2_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_2_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_3_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_3_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_3_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_3_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x 
ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_4_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_4_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_4_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_4_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_5_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_6_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_6_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_6_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p3_v4p3__7_6_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_u_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x 
ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_0_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_0_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_0_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_0_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 
x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_1_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_1_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_1_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_1_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr 
addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_2_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_2_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_2_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_2_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_3_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_3_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_3_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_3_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_4_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_4_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_4_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_4_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_5_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_5_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_5_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_5_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_6_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_6_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_6_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_6_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_7_u() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_7_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_7_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_7_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call 
<4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_7_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_7_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_7_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_7_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = 
call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_7_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_7_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_7_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_7_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + 
%vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_7_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_7_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_7_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_7_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_7_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_7_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_7_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_7_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr 
addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_7_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_7_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_7_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_7_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_7_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_7_5: 
+; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_7_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_7_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_7_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_7_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 
s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_7_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_7_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__u_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__u_0_0_0: +; 
GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__0_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__0_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__0_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__0_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm 
"; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> zeroinitializer + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__1_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__1_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__1_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__1_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__2_0_0_0() { +; GFX900-LABEL: 
s_shuffle_v4p3_v4p3__2_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__2_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__2_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__3_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: 
s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__4_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__4_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_mov_b32 s6, s4 +; GFX900-NEXT: s_mov_b32 s7, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__4_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: s_mov_b32 s5, s4 +; GFX90A-NEXT: s_mov_b32 s6, s4 +; GFX90A-NEXT: s_mov_b32 s7, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__4_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s0 +; GFX940-NEXT: s_mov_b32 s2, s0 +; GFX940-NEXT: s_mov_b32 s3, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__5_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__5_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s9 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__5_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s9 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__5_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s5 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__6_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__6_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s10 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__6_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s10 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__6_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s6 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_0_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_u_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4p3_v4p3__7_u_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_1_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_1_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_1_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p3_v4p3__7_1_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_2_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_2_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_2_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p3_v4p3__7_2_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_3_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_3_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_3_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p3_v4p3__7_3_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_4_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_4_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_4_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4p3_v4p3__7_4_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_5_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_5_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_5_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_5_0_0: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_6_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_6_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_6_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_6_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_0_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_0_0: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_u_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_u_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_1_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_1_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_1_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_1_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_2_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_2_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_2_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_2_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_3_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_3_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_3_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_3_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_4_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_4_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_4_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_4_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_5_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_5_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_5_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_5_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_6_0() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_6_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_6_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_6_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__u_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__u_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__u_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__u_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x 
ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__0_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__0_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__0_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__0_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__1_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__1_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__1_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__1_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__2_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__2_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__2_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__2_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__3_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__3_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__3_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__3_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__4_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__4_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__4_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__4_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__5_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__5_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__5_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__5_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__6_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__6_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__6_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__6_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_1_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_1_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_1_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_1_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_u_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_u_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_0_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_0_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_0_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_0_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_2_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_2_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_2_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_2_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_3_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_3_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_3_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_3_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_4_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_4_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s8 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_4_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s8 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_4_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s4 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_5_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_5_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND 
+; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_5_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_5_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_6_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_6_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v4p3__7_6_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_6_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_1_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_1_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_1_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_1_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_u_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v4p3__7_7_u_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_u_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_0_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_0_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_0_1: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_0_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_2_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_2_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_2_1: +; GFX90A: ; %bb.0: 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_2_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_3_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_3_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_3_1: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_3_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_4_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_4_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_4_1: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_4_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_5_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_5_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v4p3__7_7_5_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_5_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_6_1() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_6_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v4p3__7_7_6_1: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_6_1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__u_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__u_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__u_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, 
s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__u_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__0_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__0_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__0_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__0_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__1_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__1_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__1_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__1_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__2_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__2_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__2_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__2_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__3_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__3_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__3_2_2_2: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__3_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__4_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__4_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__4_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__4_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__5_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__5_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__5_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__5_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__6_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__6_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__6_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__6_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] 
+ %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_2_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_2_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_2_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_2_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm 
"; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_u_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_u_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret 
void +} + +define void @s_shuffle_v4p3_v4p3__7_0_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_0_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_0_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_0_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret 
void +} + +define void @s_shuffle_v4p3_v4p3__7_1_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_1_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_1_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_1_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_3_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_3_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_3_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_3_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_4_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_4_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_4_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_4_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_5_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_5_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: 
s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_5_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_5_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_6_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_6_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; 
use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_6_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_6_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_2_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_2_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v4p3__7_7_2_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_2_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_u_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_u_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_0_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_0_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_0_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_0_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_1_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_1_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_1_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_1_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_3_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_3_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_3_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_3_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_4_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_4_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_4_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_4_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_5_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_5_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_5_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_5_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_6_2() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_6_2: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_6_2: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_6_2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__u_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__u_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__u_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__u_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__0_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__0_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__0_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__0_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr 
addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__1_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__2_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__3_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__3_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__3_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__3_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__4_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__4_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__4_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__4_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; 
GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__5_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__5_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s9 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__5_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s9 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__5_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s5 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call 
<4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__6_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__6_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s10 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__6_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s10 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__6_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s6 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_3_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + 
call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_u_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_u_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_0_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_0_3_3: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_0_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_0_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_1_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_1_3_3: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_1_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_1_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_2_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_2_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] 
+; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_2_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_2_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_4_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_4_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s8 +; 
GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_4_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s8 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_4_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s4 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_5_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_5_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; 
GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_5_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_5_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s6, s3 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_6_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_6_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v4p3__7_6_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_6_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_3_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_u_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_u_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_0_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_0_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_0_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; 
GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_0_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_1_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_1_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_1_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 
+; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_1_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_2_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_2_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_2_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_2_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_4_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_4_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_4_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_4_3: +; GFX940: ; %bb.0: +; 
GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_5_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_5_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_5_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_5_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_6_3() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_6_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_6_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_6_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; 
GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s7, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__u_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__u_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__u_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__u_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__0_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__0_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
s_shuffle_v4p3_v4p3__0_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__0_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__1_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__1_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__1_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__1_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__2_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__2_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__2_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__2_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__3_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__3_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: ;;#ASMSTART 
+; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__3_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__3_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__4_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__4_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__4_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__4_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> poison, 
<4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__5_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__5_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__5_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__5_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__6_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__6_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__6_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__6_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_4_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_4_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; 
GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_4_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_4_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_u_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_u_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_0_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_0_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s8 +; GFX900-NEXT: s_mov_b32 s15, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_0_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 
s14, s8 +; GFX90A-NEXT: s_mov_b32 s15, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_0_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_1_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_1_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_1_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_1_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_2_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_2_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_2_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_2_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_3_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_3_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s8 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_3_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s8 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_3_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s4 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_5_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_5_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s5 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_5_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s5 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_5_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s1 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_6_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_6_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_6_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_6_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_4_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_4_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_4_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_4_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = 
shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_u_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_u_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_0_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_0_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_0_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_0_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_1_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_1_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_1_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_1_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_2_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_2_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_2_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_2_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_3_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_3_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s8 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_3_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s8 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_3_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s4 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_5_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_5_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_5_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_5_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_6_4() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_6_4: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s4 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_6_4: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s4 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_6_4: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s0 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__u_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__u_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__u_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__u_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__0_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__0_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s9 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__0_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s9 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__0_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s5 +; GFX940-NEXT: s_mov_b32 s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__1_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__1_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__1_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__1_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__2_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__2_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__2_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__2_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__3_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__3_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s10, s9 +; GFX900-NEXT: s_mov_b32 s11, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__3_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s10, s9 +; GFX90A-NEXT: s_mov_b32 s11, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__3_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s6, s5 +; GFX940-NEXT: s_mov_b32 s7, s5 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__4_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__4_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__4_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__4_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v4p3_v4p3__5_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__5_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__5_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__5_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__6_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__6_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: 
;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__6_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__6_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_5_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_5_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_5_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; 
GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_5_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_u_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s6, s5 +; GFX900-NEXT: s_mov_b32 s7, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s5 +; GFX90A-NEXT: s_mov_b32 s7, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_u_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; 
def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s1 +; GFX940-NEXT: s_mov_b32 s3, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_0_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_0_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s4 +; GFX900-NEXT: s_mov_b32 s14, s9 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_0_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s4 +; GFX90A-NEXT: s_mov_b32 s14, s9 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_0_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: 
;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s0 +; GFX940-NEXT: s_mov_b32 s10, s5 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_1_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_1_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_1_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_1_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 
s2, s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_2_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_2_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_2_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_2_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s2, 
s5 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_3_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_3_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s9 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_3_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s9 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_3_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s5 +; 
GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_4_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_4_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_4_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_4_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm 
"; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_6_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_6_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_6_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_6_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void 
@s_shuffle_v4p3_v4p3__7_7_5_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_5_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_5_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_5_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_u_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; 
GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_0_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_0_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_0_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_0_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_1_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_1_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_1_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_1_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_2_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_2_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_2_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_2_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_3_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_3_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s9 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_3_5: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s9 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_3_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_4_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_4_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_4_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_4_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_6_5() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_6_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_6_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_6_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__u_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__u_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__u_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__u_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use 
s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__0_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__0_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s10 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__0_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s10 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__0_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s6 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__1_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__1_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__1_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__1_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() 
+ %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__2_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__2_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__2_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__2_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + 
call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__3_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__3_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s10 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__3_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s10 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__3_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s6 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + 
+define void @s_shuffle_v4p3_v4p3__4_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__4_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__4_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__4_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__5_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__5_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__5_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__5_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__6_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__6_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__6_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__6_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_6_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_6_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s6 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_6_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s6 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_6_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: 
s_mov_b32 s1, s2 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_u_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_u_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr 
addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_0_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_0_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_0_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_0_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_1_6_6() { +; 
GFX900-LABEL: s_shuffle_v4p3_v4p3__7_1_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s10 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_1_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s10 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_1_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s6 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_2_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_2_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_2_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_2_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_3_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_3_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: 
;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s11, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_3_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s11, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_3_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s7, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_4_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_4_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: 
s_mov_b32 s10, s6 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_4_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s6 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_4_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s2 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_5_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_5_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_5_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_5_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_6_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_6_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_6_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_6_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_u_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s7, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_u_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s3, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", 
"=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_0_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_0_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_0_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_0_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> 
asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_1_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_1_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s12, s11 +; GFX900-NEXT: s_mov_b32 s13, s11 +; GFX900-NEXT: s_mov_b32 s14, s5 +; GFX900-NEXT: s_mov_b32 s15, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[12:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_1_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s12, s11 +; GFX90A-NEXT: s_mov_b32 s13, s11 +; GFX90A-NEXT: s_mov_b32 s14, s5 +; GFX90A-NEXT: s_mov_b32 s15, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[12:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_1_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s7 +; GFX940-NEXT: s_mov_b32 s9, s7 +; GFX940-NEXT: s_mov_b32 s10, s1 +; GFX940-NEXT: s_mov_b32 s11, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:11] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr 
addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_2_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_2_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_2_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_2_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + 
%shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_3_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_3_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: s_mov_b32 s7, s10 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_3_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: s_mov_b32 s7, s10 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_3_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: s_mov_b32 s3, s6 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + 
%shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_4_6() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_4_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_4_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_4_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s0 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_7_5_6() { +; 
GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_5_6: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s5 +; GFX900-NEXT: s_mov_b32 s11, s6 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_5_6: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s5 +; GFX90A-NEXT: s_mov_b32 s11, s6 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_5_6: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s1 +; GFX940-NEXT: s_mov_b32 s7, s2 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__u_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__u_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: 
s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__u_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__u_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__0_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__0_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__0_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__0_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__1_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__1_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s5 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__1_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b32 s8, s5 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__1_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s1 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__2_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__2_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__2_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 
+; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__2_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s2 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__3_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__3_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s11 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__3_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s11 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__3_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: s_mov_b32 s5, s7 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__4_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__4_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__4_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__4_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__5_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__5_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s5 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__5_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s5 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__5_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s1 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", 
"{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__6_7_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__6_7_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s6 +; GFX900-NEXT: s_mov_b32 s5, s7 +; GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__6_7_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s6 +; GFX90A-NEXT: s_mov_b32 s5, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__6_7_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s2 +; GFX940-NEXT: s_mov_b32 s1, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_u_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_u_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s7 +; 
GFX900-NEXT: s_mov_b32 s6, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_u_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s7 +; GFX90A-NEXT: s_mov_b32 s6, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_u_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s3 +; GFX940-NEXT: s_mov_b32 s2, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_0_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_0_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_0_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_0_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s0 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_1_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_1_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s4, s11 +; GFX900-NEXT: s_mov_b32 s6, s11 +; GFX900-NEXT: s_mov_b32 s7, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_1_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s4, s11 +; GFX90A-NEXT: s_mov_b32 s6, s11 +; GFX90A-NEXT: s_mov_b32 s7, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_1_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s0, s7 +; GFX940-NEXT: s_mov_b32 s2, s7 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_2_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_2_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s6 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_2_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s6 +; GFX90A-NEXT: 
s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_2_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s2 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_3_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_3_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s11 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: s_mov_b32 s10, s11 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_3_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s11 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: s_mov_b32 s10, s11 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_3_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:3] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s4, s7 +; GFX940-NEXT: s_mov_b32 s5, s3 +; GFX940-NEXT: s_mov_b32 s6, s7 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"() + %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> + call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf) + ret void +} + +define void @s_shuffle_v4p3_v4p3__7_4_7_7() { +; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_4_7_7: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:7] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s7 +; GFX900-NEXT: s_mov_b32 s9, s4 +; GFX900-NEXT: s_mov_b32 s10, s7 +; GFX900-NEXT: s_mov_b32 s11, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:11] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_4_7_7: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s7 +; GFX90A-NEXT: s_mov_b32 s9, s4 +; GFX90A-NEXT: s_mov_b32 s10, s7 +; GFX90A-NEXT: s_mov_b32 s11, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:11] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_4_7_7: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART 
+; GFX940-NEXT: ; def s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s4, s3
+; GFX940-NEXT: s_mov_b32 s5, s0
+; GFX940-NEXT: s_mov_b32 s6, s3
+; GFX940-NEXT: s_mov_b32 s7, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[4:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32>
+  call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p3_v4p3__7_5_7_7() {
+; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_5_7_7:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s7
+; GFX900-NEXT: s_mov_b32 s6, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_5_7_7:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s7
+; GFX90A-NEXT: s_mov_b32 s6, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_5_7_7:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s3
+; GFX940-NEXT: s_mov_b32 s2, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 5, i32 7, i32 7> ; mask indices 4-7 select %vec1: lanes 3, 1, 3, 3
+  call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p3_v4p3__7_6_7_7() {
+; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_6_7_7:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s7
+; GFX900-NEXT: s_mov_b32 s5, s6
+; GFX900-NEXT: s_mov_b32 s6, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_6_7_7:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s7
+; GFX90A-NEXT: s_mov_b32 s5, s6
+; GFX90A-NEXT: s_mov_b32 s6, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_6_7_7:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s3
+; GFX940-NEXT: s_mov_b32 s1, s2
+; GFX940-NEXT: s_mov_b32 s2, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 6, i32 7, i32 7> ; mask indices 4-7 select %vec1: lanes 3, 2, 3, 3
+  call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p3_v4p3__7_7_u_7() {
+; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_u_7:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s7
+; GFX900-NEXT: s_mov_b32 s5, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_u_7:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s7
+; GFX90A-NEXT: s_mov_b32 s5, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_u_7:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s3
+; GFX940-NEXT: s_mov_b32 s1, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 poison, i32 7> ; "u" in the name = don't-care lane; the rest select %vec1 lane 3
+  call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p3_v4p3__7_7_0_7() {
+; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_0_7:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s11
+; GFX900-NEXT: s_mov_b32 s9, s11
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_0_7:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s11
+; GFX90A-NEXT: s_mov_b32 s9, s11
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_0_7:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s4, s7
+; GFX940-NEXT: s_mov_b32 s5, s7
+; GFX940-NEXT: s_mov_b32 s6, s0
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[4:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 0, i32 7> ; lane 2 comes from %vec0 lane 0; the rest select %vec1 lane 3
+  call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p3_v4p3__7_7_1_7() {
+; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_1_7:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s11
+; GFX900-NEXT: s_mov_b32 s9, s11
+; GFX900-NEXT: s_mov_b32 s10, s5
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_1_7:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s11
+; GFX90A-NEXT: s_mov_b32 s9, s11
+; GFX90A-NEXT: s_mov_b32 s10, s5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_1_7:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s4, s7
+; GFX940-NEXT: s_mov_b32 s5, s7
+; GFX940-NEXT: s_mov_b32 s6, s1
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[4:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 1, i32 7> ; lane 2 comes from %vec0 lane 1; the rest select %vec1 lane 3
+  call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p3_v4p3__7_7_2_7() {
+; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_2_7:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s11
+; GFX900-NEXT: s_mov_b32 s5, s11
+; GFX900-NEXT: s_mov_b32 s7, s11
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_2_7:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s11
+; GFX90A-NEXT: s_mov_b32 s5, s11
+; GFX90A-NEXT: s_mov_b32 s7, s11
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_2_7:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s7
+; GFX940-NEXT: s_mov_b32 s1, s7
+; GFX940-NEXT: s_mov_b32 s3, s7
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 2, i32 7> ; lane 2 comes from %vec0 lane 2; the rest select %vec1 lane 3
+  call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p3_v4p3__7_7_3_7() {
+; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_3_7:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[8:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s11
+; GFX900-NEXT: s_mov_b32 s9, s11
+; GFX900-NEXT: s_mov_b32 s10, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_3_7:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[8:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s11
+; GFX90A-NEXT: s_mov_b32 s9, s11
+; GFX90A-NEXT: s_mov_b32 s10, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_3_7:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[4:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s4, s7
+; GFX940-NEXT: s_mov_b32 s5, s7
+; GFX940-NEXT: s_mov_b32 s6, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[4:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 7> ; lane 2 comes from %vec0 lane 3; the rest select %vec1 lane 3
+  call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p3_v4p3__7_7_4_7() {
+; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_4_7:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s7
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: s_mov_b32 s10, s4
+; GFX900-NEXT: s_mov_b32 s11, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_4_7:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s7
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: s_mov_b32 s10, s4
+; GFX90A-NEXT: s_mov_b32 s11, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_4_7:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s4, s3
+; GFX940-NEXT: s_mov_b32 s5, s3
+; GFX940-NEXT: s_mov_b32 s6, s0
+; GFX940-NEXT: s_mov_b32 s7, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[4:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 4, i32 7> ; mask indices 4-7 select %vec1: lanes 3, 3, 0, 3
+  call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p3_v4p3__7_7_5_7() {
+; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_5_7:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s8, s7
+; GFX900-NEXT: s_mov_b32 s9, s7
+; GFX900-NEXT: s_mov_b32 s10, s5
+; GFX900-NEXT: s_mov_b32 s11, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[8:11]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_5_7:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s8, s7
+; GFX90A-NEXT: s_mov_b32 s9, s7
+; GFX90A-NEXT: s_mov_b32 s10, s5
+; GFX90A-NEXT: s_mov_b32 s11, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[8:11]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_5_7:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s4, s3
+; GFX940-NEXT: s_mov_b32 s5, s3
+; GFX940-NEXT: s_mov_b32 s6, s1
+; GFX940-NEXT: s_mov_b32 s7, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[4:7]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 5, i32 7> ; mask indices 4-7 select %vec1: lanes 3, 3, 1, 3
+  call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf)
+  ret void
+}
+
+define void @s_shuffle_v4p3_v4p3__7_7_6_7() {
+; GFX900-LABEL: s_shuffle_v4p3_v4p3__7_7_6_7:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; def s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_mov_b32 s4, s7
+; GFX900-NEXT: s_mov_b32 s5, s7
+; GFX900-NEXT: ;;#ASMSTART
+; GFX900-NEXT: ; use s[4:7]
+; GFX900-NEXT: ;;#ASMEND
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: s_shuffle_v4p3_v4p3__7_7_6_7:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_mov_b32 s4, s7
+; GFX90A-NEXT: s_mov_b32 s5, s7
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use s[4:7]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: s_shuffle_v4p3_v4p3__7_7_6_7:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; def s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_mov_b32 s0, s3
+; GFX940-NEXT: s_mov_b32 s1, s3
+; GFX940-NEXT: ;;#ASMSTART
+; GFX940-NEXT: ; use s[0:3]
+; GFX940-NEXT: ;;#ASMEND
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+  %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %vec1 = call <4 x ptr addrspace(3)> asm "; def $0", "=s"()
+  %shuf = shufflevector <4 x ptr addrspace(3)> %vec0, <4 x ptr addrspace(3)> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7> ; mask indices 4-7 select %vec1: lanes 3, 3, 2, 3
+  call void asm sideeffect "; use $0", "{s[10:13]}"(<4 x ptr addrspace(3)> %shuf)
+  ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX90APLUS: {{.*}}